From e42389f9d7a5e04aee3463b3e08bafdc86a9457b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 26 Mar 2025 03:26:16 +0000 Subject: [PATCH 001/593] Transformers backend already supports V1 (#15463) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/test_transformers.py | 22 +++++----------------- vllm/engine/arg_utils.py | 8 -------- vllm/model_executor/models/transformers.py | 2 ++ 3 files changed, 7 insertions(+), 25 deletions(-) diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index 243cb92ae2569..c45fc7e649ec8 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -3,8 +3,6 @@ Run `pytest tests/models/test_transformers.py`. """ -from contextlib import nullcontext - import pytest from ..conftest import HfRunner, VllmRunner @@ -42,7 +40,6 @@ def check_implementation( "model,model_impl", [ ("meta-llama/Llama-3.2-1B-Instruct", "transformers"), - ("openai-community/gpt2", "transformers"), ("ArthurZ/Ilama-3.2-1B", "auto"), # CUSTOM CODE ]) # trust_remote_code=True by default def test_models( @@ -52,20 +49,11 @@ def test_models( model: str, model_impl: str, ) -> None: - - maybe_raises = nullcontext() - if model == "openai-community/gpt2" and model_impl == "transformers": - # Model is not backend compatible - maybe_raises = pytest.raises( - ValueError, - match="The Transformers implementation.*not compatible with vLLM") - - with maybe_raises: - check_implementation(hf_runner, - vllm_runner, - example_prompts, - model, - model_impl=model_impl) + check_implementation(hf_runner, + vllm_runner, + example_prompts, + model, + model_impl=model_impl) @multi_gpu_test(num_gpus=2) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 65a1676c0637d..75ac326aaa3d6 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1613,14 +1613,6 @@ class EngineArgs: recommend_to_remove=False) return False 
- # No TransformersModel support so far. - if (model_config.model_impl == ModelImpl.TRANSFORMERS - or model_config.model_impl == "transformers"): - _raise_or_fallback( - feature_name=f"model_impl={model_config.model_impl}", - recommend_to_remove=False) - return False - # No Concurrent Partial Prefills so far. if (self.max_num_partial_prefills != EngineArgs.max_num_partial_prefills diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index fe6a9d7a4aa43..56ec00dcf222c 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -24,6 +24,7 @@ from transformers import AutoModel, PretrainedConfig, PreTrainedModel from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS from vllm.attention import Attention +from vllm.compilation.decorators import support_torch_compile from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, VllmConfig) from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size @@ -109,6 +110,7 @@ def replace_linear_class( ) +@support_torch_compile class TransformersModel(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): embedding_padding_modules = ["lm_head"] embedding_modules = ["embed_tokens" From 997c8811d6aadf92dc299e0c2a8d274117308880 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 26 Mar 2025 11:26:33 +0800 Subject: [PATCH 002/593] [Model] Support multi-image for Molmo (#15438) Signed-off-by: DarkLight1337 --- docs/source/models/supported_models.md | 2 +- .../vision_language/test_models.py | 2 +- vllm/model_executor/models/molmo.py | 57 +++++++++---------- vllm/model_executor/models/vision.py | 13 +++-- 4 files changed, 39 insertions(+), 35 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 56ea8c5d8372b..f106195e10585 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -853,7 +853,7 @@ 
See [this page](#generative-models) for more information on how to use generativ * - * `MolmoForCausalLM` * Molmo - * T + I + * T + I+ * `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. * ✅︎ * ✅︎ diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 94b61b6ae7803..d500ef5d8b805 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -431,7 +431,7 @@ VLM_TEST_SETTINGS = { ), "molmo": VLMTestInfo( models=["allenai/Molmo-7B-D-0924"], - test_type=(VLMTestType.IMAGE), + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), prompt_formatter=identity, max_model_len=4096, max_num_seqs=2, diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 3f0c644a5a866..146d48e522119 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -57,7 +57,7 @@ from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, merge_multimodal_embeddings) -from .vision import select_patch_features +from .vision import scatter_patch_features, select_patch_features # TODO: hard-coded for now. Consider making it configurable. VIT_LAYERS = [-2, -9] @@ -71,13 +71,13 @@ POOLING_SIZE = 2 class MolmoImageInputs(TypedDict): - images: Union[torch.Tensor, List[torch.Tensor]] + images: Union[torch.Tensor, list[torch.Tensor]] """Shape: `(batch_size, num_crops, num_patch, patch_dim)`""" - image_masks: Optional[Union[torch.Tensor, List[torch.Tensor]]] + image_masks: Optional[Union[torch.Tensor, list[torch.Tensor]]] """Shape: `(batch_size, num_crops, num_patch)`""" - feat_is_patch: Union[torch.Tensor, List[torch.Tensor]] + feat_is_patch: Union[torch.Tensor, list[torch.Tensor]] """ A boolean mask indicating which image features correspond to patch tokens. 
@@ -85,7 +85,7 @@ class MolmoImageInputs(TypedDict): Shape: `(batch_size, num_crops, num_patch)` """ - embed_is_patch: Union[torch.Tensor, List[torch.Tensor]] + embed_is_patch: Union[torch.Tensor, list[torch.Tensor]] """ A boolean mask indicating which image embeddings correspond to patch tokens. @@ -93,7 +93,7 @@ class MolmoImageInputs(TypedDict): Shape: `(batch_size, num_embeds)` """ - num_crops: torch.Tensor + num_crops: Union[torch.Tensor, list[torch.Tensor]] """Shape: `(batch_size, num_images)`""" @@ -1144,13 +1144,7 @@ class MolmoProcessorWrapper: image_input_idx = outputs.pop("image_input_idx", None) if image_input_idx is not None: - input_is_patch = input_ids == self.image_patch_id - image_input_idx_flat: torch.Tensor = image_input_idx.view(-1) - image_valid_flat = image_input_idx_flat >= 0 - feat_is_patch_flat = image_valid_flat.clone() - feat_is_patch_flat[image_valid_flat] = ( - input_is_patch[image_input_idx_flat[image_valid_flat]]) - feat_is_patch = feat_is_patch_flat.view(*image_input_idx.shape) + feat_is_patch = image_input_idx >= 0 input_is_embed = torch.isin( input_ids, @@ -1165,6 +1159,17 @@ class MolmoProcessorWrapper: embed_is_patch = embed_ids == self.image_patch_id assert embed_is_patch.sum() == feat_is_patch.sum() + # image_tokens = extra_joint + joint + # Both `extra_joint` and `joint` have `im_start_id` and `im_end_id` + embed_start = torch.nonzero(embed_ids == self.im_start_id)[::2, 0] + embed_end = torch.nonzero(embed_ids == self.im_end_id)[1::2, 0] + assert len(embed_start) == len(embed_end) == len(images) + + embed_is_patch = [ + embed_is_patch[start:end + 1] + for start, end in zip(embed_start, embed_end) + ] + tilings = [ self.select_tiling( image_width=image.size[0], @@ -1180,7 +1185,7 @@ class MolmoProcessorWrapper: outputs["num_crops"] = num_crops outputs["img_patch_id"] = self.image_patch_id - return BatchFeature(outputs, tensor_type=return_tensors) + return BatchFeature(outputs) class MolmoProcessingInfo(BaseProcessingInfo): @@ 
-1190,9 +1195,7 @@ class MolmoProcessingInfo(BaseProcessingInfo): return MolmoProcessorWrapper(processor) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - # TODO: Investigate different `embed_is_patch` between cache/no-cache - # in multi-image case - return {"image": 1} + return {"image": None} def get_mm_max_tokens_per_item( self, @@ -1325,7 +1328,7 @@ class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]): "image", num_crops), feat_is_patch=MultiModalFieldConfig.flat_from_sizes( "image", num_crops), - embed_is_patch=MultiModalFieldConfig.shared("image", num_images), + embed_is_patch=MultiModalFieldConfig.batched("image"), num_crops=MultiModalFieldConfig.batched("image"), img_patch_id=MultiModalFieldConfig.shared("image", num_images), ) @@ -1499,7 +1502,7 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, def _process_image_input( self, image_input: MolmoImageInputs, - ) -> Union[torch.Tensor, List[torch.Tensor]]: + ) -> Union[torch.Tensor, list[torch.Tensor]]: if isinstance(image_input["images"], list): # Call the vision backbone on the whole batch at once images_flat = flatten_bn(image_input["images"], concat=True) @@ -1530,7 +1533,7 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, feat_is_patch: torch.Tensor, # Shape: (num_crop, num_patch) num_crops: torch.Tensor, # Shape: (num_images,) embed_is_patch: torch.Tensor, # Shape: (num_embeds,) - ) -> list[torch.Tensor]: + ) -> tuple[torch.Tensor, ...]: """ Scatter the patch features into a contiguous tensor that corresponds to the embedding tokens defined by the multimodal processor. 
@@ -1565,16 +1568,12 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, feats_per_image = features.split(num_crops_per_image) f_is_patch_per_image = feat_is_patch.split(num_crops_per_image) - _, _, embed_dim = features.shape - (num_embeds, ) = embed_is_patch.shape + features = torch.cat([ + feats[f_is_patch] + for feats, f_is_patch in zip(feats_per_image, f_is_patch_per_image) + ]) - embeds_in_batch = list[torch.Tensor]() - for feats, f_is_patch in zip(feats_per_image, f_is_patch_per_image): - embeds = feats.new_full((num_embeds, embed_dim), torch.nan) - embeds[embed_is_patch] = feats[f_is_patch] - embeds_in_batch.append(embeds) - - return embeds_in_batch + return scatter_patch_features(features, embed_is_patch) def get_multimodal_embeddings( self, **kwargs: object) -> Optional[MultiModalEmbeddings]: diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 250b0ee3c2a1b..c91459398308e 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -155,7 +155,7 @@ def resolve_visual_encoder_outputs( def scatter_patch_features( features: torch.Tensor, - embed_is_patch: torch.Tensor, + embed_is_patch: Union[torch.Tensor, list[torch.Tensor]], ) -> tuple[torch.Tensor, ...]: """ Scatter the patch features into a contiguous tensor that corresponds @@ -194,14 +194,19 @@ def scatter_patch_features( The resulting embedding tensor is: [ nan p1 p2 nan p3 p4 nan nan ] """ - num_images, num_embeds = embed_is_patch.shape - num_embeds_per_image = [num_embeds] * num_images + num_embeds_per_image = [ + e_is_patch.numel() for e_is_patch in embed_is_patch + ] + if isinstance(embed_is_patch, torch.Tensor): + embed_is_patch_flat = embed_is_patch.view(-1) + else: + embed_is_patch_flat = torch.cat(embed_is_patch) embeds_flat = features.new_full( (sum(num_embeds_per_image), features.shape[-1]), fill_value=torch.nan, ) - embeds_flat[embed_is_patch.view(-1)] = features.flatten(0, -2) + 
embeds_flat[embed_is_patch_flat] = features.flatten(0, -2) return embeds_flat.split(num_embeds_per_image) From 23114d33640175229a395b9ed1128c3a41ad65d9 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Tue, 25 Mar 2025 23:31:04 -0400 Subject: [PATCH 003/593] [Misc] Warn about v0 in benchmark_paged_attn.py (#15495) Signed-off-by: Tyler Michael Smith --- benchmarks/kernels/benchmark_paged_attention.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 48b351bc48141..2625239b08ef2 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -7,10 +7,13 @@ from typing import Optional import torch from vllm import _custom_ops as ops +from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, create_kv_caches_with_random) +logger = init_logger(__name__) + NUM_BLOCKS = 128 * 1024 PARTITION_SIZE = 512 PARTITION_SIZE_ROCM = 256 @@ -193,6 +196,9 @@ def main( if __name__ == '__main__': + logger.warning("This script benchmarks the paged attention kernel. " + "By default this is no longer used in vLLM inference.") + parser = FlexibleArgumentParser( description="Benchmark the paged attention kernel.") parser.add_argument("--version", From 33437bc6e7af316fa9ce6b6e559501ca45d9cd45 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 25 Mar 2025 23:33:22 -0400 Subject: [PATCH 004/593] [BugFix] Fix nightly MLA failure (FA2 + MLA chunked prefill, i.e. 
V1, producing bad results) (#15492) Signed-off-by: LucasWilkinson --- vllm/attention/ops/triton_merge_attn_states.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/attention/ops/triton_merge_attn_states.py b/vllm/attention/ops/triton_merge_attn_states.py index 31545b607fecd..9671b933f47b9 100644 --- a/vllm/attention/ops/triton_merge_attn_states.py +++ b/vllm/attention/ops/triton_merge_attn_states.py @@ -54,6 +54,15 @@ def merge_attn_states_kernel( p_lse = tl.load(prefix_lse + head_idx * num_tokens + token_idx) s_lse = tl.load(suffix_lse + head_idx * num_tokens + token_idx) + + # FA2 and FA3 have different behavior for when the sum-exp is 0, this namely + # arises with 0 len seqlens. FA3 returns -inf here while FA2 returns inf. + # If we see an inf assume FA2 and convert inf to -inf for consistency + # and correctness. Inf generally doesn't make sense in this context outside + # of undefined-behavior/FA2-case, so I think this a safe assumption. + p_lse = float('-inf') if p_lse == float('inf') else p_lse + s_lse = float('-inf') if s_lse == float('inf') else s_lse + max_lse = tl.maximum(p_lse, s_lse) p_lse = p_lse - max_lse s_lse = s_lse - max_lse From 6c663dfd5e5b5ab4a1eb46391c2c65d1eff0218f Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Tue, 25 Mar 2025 20:33:45 -0700 Subject: [PATCH 005/593] [misc] LoRA - Skip LoRA kernels when not required (#15152) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- vllm/lora/ops/triton_ops/lora_expand.py | 13 +++- .../ops/triton_ops/lora_kernel_metadata.py | 42 ++++++++-- vllm/lora/ops/triton_ops/lora_shrink.py | 13 +++- vllm/worker/model_runner.py | 78 +++++++++++++------ 4 files changed, 113 insertions(+), 33 deletions(-) diff --git a/vllm/lora/ops/triton_ops/lora_expand.py b/vllm/lora/ops/triton_ops/lora_expand.py index b47e491ad7ed1..eacc6fb46ebd7 100644 --- a/vllm/lora/ops/triton_ops/lora_expand.py +++ b/vllm/lora/ops/triton_ops/lora_expand.py @@ -136,6 
+136,7 @@ def _lora_expand( num_tokens_per_lora: torch.Tensor, # shape [max-loras + 1] lora_token_start_loc: torch.Tensor, # shape [max-loras + 2] lora_ids: torch.Tensor, # shape [max-loras + 1] + no_lora_flag_cpu: torch.Tensor, # shape [1] offset_start: int = 0, add_inputs: bool = False, ) -> None: @@ -157,11 +158,19 @@ def _lora_expand( identifies the the region in token_indices_sorted_by_lora_ids that LoRA lora_ids[i] should process. lora_ids (torch.Tensor): LoRA ids to process. + no_lora_flag_cpu (torch.Tensor): A CPU tensor of size 1, that indicates + if there are any requests that require LoRA. offset_start (int, optional): Offset start for output_tensor. Defaults to 0. add_inputs (bool, optional): Whether to add the input tensor to the output tensor. Defaults to False. """ + + assert no_lora_flag_cpu.numel() == 1 + if no_lora_flag_cpu.item(): + # None of the inputs require LoRA. + return + assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] for weight in lora_b_weights: assert weight.dtype in [torch.float16, torch.bfloat16] @@ -170,6 +179,8 @@ def _lora_expand( assert output_tensor.is_contiguous() # metadata sanity check. 
+ M = inputs.size(1) + assert token_lora_mapping.size(0) == M assert token_lora_mapping.size(0) == token_indices_sorted_by_lora_ids.size( 0) assert lora_ids.size(0) == num_tokens_per_lora.size(0) @@ -181,7 +192,6 @@ def _lora_expand( inputs.device) K = lora_b_weights[0].shape[-1] # K= rank - M = inputs.size(1) ADD_INPUTS = add_inputs MAX_LORAS = lora_ids.size(0) CAST_TYPE = False @@ -263,6 +273,7 @@ def _lora_expand_fake( num_tokens_per_lora: torch.Tensor, lora_token_start_loc: torch.Tensor, lora_ids: torch.Tensor, + no_lora_flag_cpu: torch.Tensor, offset_start: int = 0, add_inputs: bool = False, ) -> None: diff --git a/vllm/lora/ops/triton_ops/lora_kernel_metadata.py b/vllm/lora/ops/triton_ops/lora_kernel_metadata.py index 2add1177e84c8..1dcdfc814a891 100644 --- a/vllm/lora/ops/triton_ops/lora_kernel_metadata.py +++ b/vllm/lora/ops/triton_ops/lora_kernel_metadata.py @@ -17,6 +17,17 @@ class LoRAKernelMeta: num_tokens_per_lora: torch.Tensor lora_token_start_loc: torch.Tensor + # The V1 architecture uses the traced torch.compile graphs to execute + # a forward pass. Things to note about this process, + # 1. The tracing infers all python scalar datatype objects into a constant + # value. + # 2. The tracing cannot handle dynamic control flow. (dynamic control flow + # is an experimental feature in pytorch) + # 3. The internals of torch.ops functions are not traced. + # We disguise the "no_lora" flag as a cpu tensor and leverage point number 3 + # to early exit from inside the lora_expand / lora_shrink torch operation. 
+ no_lora_flag_cpu: torch.Tensor + @staticmethod def make(max_loras: int, max_num_tokens: int, device: Union[torch.device, str]) -> "LoRAKernelMeta": @@ -47,17 +58,24 @@ class LoRAKernelMeta: lora_token_start_loc = torch.zeros(max_loras + 2, dtype=torch.int32, device=device) + + no_lora_flag_cpu = torch.tensor([False], + dtype=torch.bool, + device='cpu') + return LoRAKernelMeta( token_lora_mapping=token_lora_mapping, token_indices_sorted_by_lora_ids=token_indices_sorted_by_lora_ids, active_lora_ids=active_lora_ids, num_tokens_per_lora=num_tokens_per_lora, - lora_token_start_loc=lora_token_start_loc) + lora_token_start_loc=lora_token_start_loc, + no_lora_flag_cpu=no_lora_flag_cpu) def _reset(self): self.active_lora_ids.fill_(-1) self.num_tokens_per_lora.fill_(0) self.lora_token_start_loc.fill_(0) + self.no_lora_flag_cpu.fill_(False) def prepare_tensors(self, token_lora_mapping: torch.Tensor) -> None: """ @@ -70,6 +88,14 @@ class LoRAKernelMeta: self._reset() + # Check and record no-lora case. + no_lora = torch.all(token_lora_mapping == -1) + self.no_lora_flag_cpu[0] = no_lora + + if no_lora: + # Early exit. LoRA kernels will not be run. + return + num_tokens = token_lora_mapping.size(0) # copy token lora mapping @@ -100,7 +126,7 @@ class LoRAKernelMeta: def meta_args( self, token_nums: int ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, - torch.Tensor]: + torch.Tensor, torch.Tensor]: """ This function returns the kernel metadata required for the current forward pass execution of the kernel. The function returns all the @@ -111,7 +137,11 @@ class LoRAKernelMeta: token_nums (int): Number of input tokens in the current forward pass. 
""" - return (self.token_lora_mapping[:token_nums], - self.token_indices_sorted_by_lora_ids[:token_nums], - self.num_tokens_per_lora, self.lora_token_start_loc, - self.active_lora_ids) + return ( + self.token_lora_mapping[:token_nums], + self.token_indices_sorted_by_lora_ids[:token_nums], + self.num_tokens_per_lora, + self.lora_token_start_loc, + self.active_lora_ids, + self.no_lora_flag_cpu, + ) diff --git a/vllm/lora/ops/triton_ops/lora_shrink.py b/vllm/lora/ops/triton_ops/lora_shrink.py index a97c50c44f47a..82331939d859b 100644 --- a/vllm/lora/ops/triton_ops/lora_shrink.py +++ b/vllm/lora/ops/triton_ops/lora_shrink.py @@ -106,6 +106,7 @@ def _lora_shrink( num_tokens_per_lora: torch.Tensor, # shape [max-loras + 1] lora_token_start_loc: torch.Tensor, # shape [max-loras + 2] lora_ids: torch.Tensor, # shape [max-loras + 1] + no_lora_flag_cpu: torch.Tensor, # shape [1] scaling: float, ) -> None: """ @@ -126,8 +127,16 @@ def _lora_shrink( identifies the region in token_indices_sorted_by_lora_ids that LoRA lora_ids[i] should process. lora_ids (torch.Tensor): LoRA ids to process. + no_lora_flag_cpu (torch.Tensor): A CPU tensor of size 1, that indicates + if there are any requests that require LoRA. scaling (float): Scaling factor. """ + + assert no_lora_flag_cpu.numel() == 1 + if no_lora_flag_cpu.item(): + # None of the inputs require LoRA. 
+ return + assert inputs.dtype == lora_a_weights[0].dtype assert inputs.dtype in [torch.float16, torch.bfloat16] for weight in lora_a_weights: @@ -138,6 +147,8 @@ def _lora_shrink( assert output_tensor.is_contiguous() # metadata sanity check + M = inputs.size(0) + assert token_lora_mapping.size(0) == M assert token_lora_mapping.size(0) == token_indices_sorted_by_lora_ids.size( 0) assert lora_ids.size(0) == num_tokens_per_lora.size(0) @@ -146,7 +157,6 @@ def _lora_shrink( (lora_ptr_tensor, lora_strides_d0, lora_strides_d1, lora_strides_d2) = _get_lora_a_ptr(lora_a_weights, inputs.device) N, K = lora_a_weights[0].shape[-2:] # K=hidden_size,N=rank - M = inputs.size(0) NUM_SLICES = len(lora_a_weights) MAX_LORAS = lora_ids.size(0) @@ -218,6 +228,7 @@ def _lora_shrink_fake( num_tokens_per_lora: torch.Tensor, lora_token_start_loc: torch.Tensor, lora_ids: torch.Tensor, + no_lora_flag_cpu: torch.Tensor, scaling: float, ) -> None: return diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 473bd901b5b23..edbafb48c9386 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1242,6 +1242,29 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): max_num_seqs = self.scheduler_config.max_num_seqs self._dummy_run(max_num_batched_tokens, max_num_seqs) + def _add_dummy_loras(self, num_loras: int) -> list[LoRARequest]: + assert num_loras > 0 + assert self.lora_manager is not None + + dummy_lora_requests: list[LoRARequest] = [] + with self.lora_manager.dummy_lora_cache(): + for idx in range(num_loras): + lora_id = idx + 1 + dummy_lora_request = LoRARequest( + lora_name=f"warmup_{lora_id}", + lora_int_id=lora_id, + lora_path="/not/a/real/path", + ) + self.lora_manager.add_dummy_lora(dummy_lora_request, + rank=LORA_WARMUP_RANK) + dummy_lora_requests.append(dummy_lora_request) + return dummy_lora_requests + + def _remove_dummy_loras(self): + # Remove dummy loras. 
+ assert self.lora_manager is not None + self.remove_all_loras() + def _dummy_run(self, max_num_batched_tokens: int, max_num_seqs: int = 1) -> None: @@ -1251,28 +1274,20 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) # This represents the maximum number of different requests - # that will have unique loras, an therefore the max amount of memory - # consumption create dummy lora request copies from the lora request - # passed in, which contains a lora from the lora warmup path. + # that will have unique loras, and therefore the max amount of + # memory consumption. Create dummy lora request copies from the + # lora request passed in, which contains a lora from the lora + # warmup path. dummy_lora_requests: List[LoRARequest] = [] dummy_lora_requests_per_seq: List[LoRARequest] = [] if self.lora_config: - assert self.lora_manager is not None - with self.lora_manager.dummy_lora_cache(): - for idx in range(self.lora_config.max_loras): - lora_id = idx + 1 - dummy_lora_request = LoRARequest( - lora_name=f"warmup_{lora_id}", - lora_int_id=lora_id, - lora_path="/not/a/real/path", - ) - self.lora_manager.add_dummy_lora(dummy_lora_request, - rank=LORA_WARMUP_RANK) - dummy_lora_requests.append(dummy_lora_request) - dummy_lora_requests_per_seq = [ - dummy_lora_requests[idx % len(dummy_lora_requests)] - for idx in range(max_num_seqs) - ] + dummy_lora_requests = self._add_dummy_loras( + self.lora_config.max_loras) + assert len(dummy_lora_requests) == self.lora_config.max_loras + dummy_lora_requests_per_seq = [ + dummy_lora_requests[idx % len(dummy_lora_requests)] + for idx in range(max_num_seqs) + ] # Profile memory usage with max_num_sequences sequences and the # total number of tokens equal to max_num_batched_tokens. 
@@ -1354,9 +1369,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): self.execute_model(model_input, kv_caches, intermediate_tensors) torch.cuda.synchronize() if self.lora_config: - # Remove dummy loras. - assert self.lora_manager is not None - self.remove_all_loras() + self._remove_dummy_loras() + return def remove_all_loras(self): @@ -1479,6 +1493,16 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): dtype=self.model_config.dtype, device=self.device) + dummy_lora_id: Optional[int] = None + dummy_lora_request: LoRARequest = [] + if self.lora_config: + # The goal is to capture the LoRA kernels in cuda graphs. + # for this purpose, as single dummy lora is sufficient. + dummy_lora_requests = self._add_dummy_loras(num_loras=1) + assert len(dummy_lora_requests) == 1 + dummy_lora_request = dummy_lora_requests[0] + dummy_lora_id = dummy_lora_request.lora_int_id + with self.attn_state.graph_capture(max_batch_size), graph_capture( self.device) as graph_capture_context: # NOTE: Capturing the largest batch size first may help reduce the @@ -1503,10 +1527,11 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): attn_metadata.enable_kv_scales_calculation = False if self.lora_config: lora_mapping = LoRAMapping( - **dict(index_mapping=[0] * batch_size, - prompt_mapping=[0] * batch_size, + **dict(index_mapping=[dummy_lora_id] * batch_size, + prompt_mapping=[dummy_lora_id] * batch_size, is_prefill=False)) - self.set_active_loras(set(), lora_mapping) + self.set_active_loras(set([dummy_lora_request]), + lora_mapping) if self.prompt_adapter_config: prompt_adapter_mapping = PromptAdapterMapping( @@ -1562,6 +1587,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): self.graph_runners[virtual_engine][batch_size] = ( graph_runner) + if self.lora_config: + self._remove_dummy_loras() + end_time = time.perf_counter() end_free_gpu_memory = torch.cuda.mem_get_info()[0] elapsed_time = end_time - start_time From 
5aefd6ac3169b7b56023549cfa9614274d6e15f0 Mon Sep 17 00:00:00 2001 From: daniel-salib Date: Tue, 25 Mar 2025 22:29:54 -0700 Subject: [PATCH 006/593] Fix raw_request extraction in load_aware_call decorator (#15382) Signed-off-by: Daniel Salib --- vllm/entrypoints/utils.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index 60cbb58af3d9a..773f52fa38f88 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -68,13 +68,20 @@ def decrement_server_load(request: Request): def load_aware_call(func): @functools.wraps(func) - async def wrapper(*args, raw_request: Request, **kwargs): + async def wrapper(*args, **kwargs): + raw_request = kwargs.get("raw_request", + args[1] if len(args) > 1 else None) + + if raw_request is None: + raise ValueError( + "raw_request required when server load tracking is enabled") + if not raw_request.app.state.enable_server_load_tracking: - return await func(*args, raw_request=raw_request, **kwargs) + return await func(*args, **kwargs) raw_request.app.state.server_load_metrics += 1 try: - response = await func(*args, raw_request=raw_request, **kwargs) + response = await func(*args, **kwargs) except Exception: raw_request.app.state.server_load_metrics -= 1 raise From 781d0562809b34f0c548cd354bbc01c861814f94 Mon Sep 17 00:00:00 2001 From: Bryan Lu <55512809+luyuzhe111@users.noreply.github.com> Date: Wed, 26 Mar 2025 01:24:07 -0700 Subject: [PATCH 007/593] [Feature] Enhance EAGLE Architecture with Proper RMS Norms (#14990) Signed-off-by: Bryan Lu Co-authored-by: Cyrus Leung --- vllm/config.py | 16 ++++++-- vllm/model_executor/models/eagle.py | 51 +++++++++++++++++++++--- vllm/transformers_utils/configs/eagle.py | 15 ++++++- 3 files changed, 70 insertions(+), 12 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 87ede1e077b8a..6f2da6aa87136 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -800,10 +800,18 @@ class ModelConfig: 
@property def is_deepseek_mla(self) -> bool: - return (hasattr(self.hf_text_config, "model_type")) \ - and (self.hf_text_config.model_type in \ - ('deepseek_v2', 'deepseek_v3', 'deepseek_mtp'))\ - and (self.hf_text_config.kv_lora_rank is not None) + if not hasattr(self.hf_text_config, "model_type"): + return False + elif self.hf_text_config.model_type in \ + ('deepseek_v2', 'deepseek_v3', 'deepseek_mtp'): + return self.hf_text_config.kv_lora_rank is not None + elif self.hf_text_config.model_type == 'eagle': + # if the model is an EAGLE module, check for the + # underlying architecture + return self.hf_text_config.model.model_type in \ + ('deepseek_v2', 'deepseek_v3') \ + and self.hf_text_config.kv_lora_rank is not None + return False def get_head_size(self) -> int: # TODO remove hard code diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index 010e51a3b9f28..3e4a5040b7c89 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -7,6 +7,7 @@ import torch.nn as nn from vllm.config import VllmConfig from vllm.logger import init_logger +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -59,7 +60,15 @@ class EAGLE(nn.Module): truncated_vocab_size < vocab_size. To use this technique, one has to find the top-k most frequent tokens in target dataset and add that as a tensor in the draft checkpoint (using key token_map). Also, the draft config - needs to have truncated_vocab_size (=k) as an attribute.""" + needs to have truncated_vocab_size (=k) as an attribute. + 4. We allow an enhanced EAGLE architecture similar to the DeepSeek MTP + module with regards to the use of additional RMS norms. 
The original + EAGLE architecture 1) skips the pre-attention norm in its first + transformer block, and 2) skips the final output norm, both of which we + found to be suboptimal. We also add the support for separate norms + applying to both the token embedding and hidden states before projection + as in DeepSeek MTP, which we found to improve performance as well. + """ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -81,9 +90,22 @@ class EAGLE(nn.Module): # While weights and biases are generally not needed, # they are retained here to support certain unit tests # (e.g., spec_decode/e2e/test_eagle_correctness.py). - self.model.model.layers[0].input_layernorm = DummyInputLayerNorm( - weight=self.model.model.layers[0].input_layernorm.weight) - self.model.model.norm = DummyOutputNorm() + if not hasattr(self.config.model, + "skip_prenorm") or self.config.model.skip_prenorm: + self.model.model.layers[0].input_layernorm = DummyInputLayerNorm( + weight=self.model.model.layers[0].input_layernorm.weight) + + if not hasattr( + self.config.model, + "skip_output_norm") or self.config.model.skip_output_norm: + self.model.model.norm = DummyOutputNorm() + + self.add_para_norm = False + if hasattr(self.config.model, + "add_para_norm") and self.config.model.add_para_norm: + self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.add_para_norm = True self.orig_vocab_size = config.vocab_size self.truncated_vocab_size = config.truncated_vocab_size @@ -128,8 +150,17 @@ class EAGLE(nn.Module): if inputs_embeds is None: inputs_embeds = self.get_input_embeddings(input_ids) - inputs_embeds = self.fc( - torch.cat([inputs_embeds, previous_hidden_states], dim=-1)) + if self.add_para_norm: + inputs_embeds = torch.cat([ + self.enorm(inputs_embeds), + self.hnorm(previous_hidden_states) + ], + dim=-1) + else: + inputs_embeds = torch.cat([inputs_embeds, 
previous_hidden_states], + dim=-1) + + inputs_embeds = self.fc(inputs_embeds) inputs_embeds[positions == 0] = 0 # masking inputs at position=0 @@ -190,6 +221,14 @@ class EAGLE(nn.Module): else: logger.warning_once("Found bias in the loaded weights but " "the model config doesn't have bias.") + elif name.startswith("enorm.weight"): + weight_loader = getattr(self.enorm.weight, "weight_loader", + default_weight_loader) + weight_loader(self.enorm.weight, loaded_weight) + elif name.startswith("hnorm.weight"): + weight_loader = getattr(self.hnorm.weight, "weight_loader", + default_weight_loader) + weight_loader(self.hnorm.weight, loaded_weight) elif name.startswith("model.lm_head.") or name.startswith( "model.model."): model_weights[name.split("model.", 1)[-1]] = loaded_weight diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py index b26aba66699fd..dd806061ff589 100644 --- a/vllm/transformers_utils/configs/eagle.py +++ b/vllm/transformers_utils/configs/eagle.py @@ -5,6 +5,8 @@ from typing import Optional, Union from transformers import AutoConfig, PretrainedConfig +from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config + class EAGLEConfig(PretrainedConfig): model_type = "eagle" @@ -14,8 +16,17 @@ class EAGLEConfig(PretrainedConfig): truncated_vocab_size: Optional[int] = None, **kwargs): - model_config = None if model is None else (AutoConfig.for_model( - **model) if isinstance(model, dict) else model) + model_config: Union[PretrainedConfig, DeepseekV2Config, None] + if isinstance(model, dict): + archs = model.get("architectures", []) + target_archs = ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"] + if any(target_arch in archs for target_arch in target_archs): + # AutoConfig does not support DeepSeek MoE models yet + model_config = DeepseekV2Config(**model) + else: + model_config = AutoConfig.for_model(**model) + else: + model_config = model for k, v in kwargs.items(): if k != "architectures" and k != 
"model_type" and hasattr( From 5ebf66748b8b67731972c389d879ca69c68dc2c4 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 26 Mar 2025 16:30:30 +0800 Subject: [PATCH 008/593] [FEAT][ROCm] Integrate Fused MoE Kernels from AITER (#14967) Signed-off-by: vllmellm Signed-off-by: tjtanaa Co-authored-by: tjtanaa --- tests/kernels/test_moe.py | 25 ++- .../model_executor/test_enabled_custom_ops.py | 36 ++++ .../decoder_only/language/test_mistral.py | 41 +---- tests/quantization/test_fp8.py | 23 ++- vllm/envs.py | 15 ++ .../layers/fused_moe/fused_moe.py | 94 ++++++++--- vllm/model_executor/layers/fused_moe/layer.py | 14 +- .../layers/fused_moe/rocm_aiter_fused_moe.py | 157 ++++++++++++++++++ .../model_executor/layers/quantization/fp8.py | 52 ++++++ 9 files changed, 391 insertions(+), 66 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 653d2734afe89..3f4dd3cf0e5d7 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -3,7 +3,6 @@ Run `pytest tests/kernels/test_moe.py`. 
""" - import pytest import torch from torch.nn import Parameter @@ -216,11 +215,17 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) @pytest.mark.parametrize("padding", [True, False]) +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) @torch.inference_mode() -def test_mixtral_moe(dtype: torch.dtype, padding: bool): +def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool, + monkeypatch): """Make sure our Mixtral MoE implementation agrees with the one from huggingface.""" + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + # Instantiate our and huggingface's MoE blocks config = MixtralConfig() hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda") @@ -268,10 +273,18 @@ def test_mixtral_moe(dtype: torch.dtype, padding: bool): torch.bfloat16: 1e-2, } - torch.testing.assert_close(hf_states.flatten(0, 1), - vllm_states, - rtol=mixtral_moe_tol[dtype], - atol=mixtral_moe_tol[dtype]) + if use_rocm_aiter: + # The values of rtol and atol are set based on the tests in ROCM AITER package. 
# noqa: E501 + # https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174 # noqa: E501 + torch.testing.assert_close(hf_states.flatten(0, 1), + vllm_states, + rtol=0.01, + atol=100) + else: + torch.testing.assert_close(hf_states.flatten(0, 1), + vllm_states, + rtol=mixtral_moe_tol[dtype], + atol=mixtral_moe_tol[dtype]) @pytest.mark.parametrize("m", [1, 33, 64, 222]) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 24147b741278b..ac2e0f3542e78 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -7,6 +7,10 @@ from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.activation import (GeluAndMul, ReLUSquaredActivation, SiluAndMul) +from vllm.model_executor.layers.fused_moe.fused_moe import ( + dispatch_fused_experts_func, dispatch_topk_func, + torch_vllm_inplace_fused_experts, torch_vllm_outplace_fused_experts, + vllm_topk_softmax) from vllm.model_executor.layers.layernorm import ( RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm, rocm_aiter_fused_add_rms_norm, rocm_aiter_rms_norm) @@ -92,6 +96,38 @@ def test_enabled_ops_invalid(env: str): RMSNorm(1024).enabled() +@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) +def test_topk_dispatch(use_rocm_aiter: str, monkeypatch): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + topk_func = dispatch_topk_func() + + if current_platform.is_rocm() and int(use_rocm_aiter): + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + rocm_aiter_topk_softmax) + + assert topk_func == rocm_aiter_topk_softmax + else: + assert topk_func == vllm_topk_softmax + + +@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) +@pytest.mark.parametrize("inplace", [True, False]) +def test_fused_experts_dispatch(use_rocm_aiter: str, inplace: bool, + monkeypatch): + + 
monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + fused_experts_func = dispatch_fused_experts_func(inplace) + if current_platform.is_rocm() and int(use_rocm_aiter): + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + rocm_aiter_fused_experts) + + assert fused_experts_func == rocm_aiter_fused_experts + elif inplace: + assert fused_experts_func == torch_vllm_inplace_fused_experts + else: + assert fused_experts_func == torch_vllm_outplace_fused_experts + + @pytest.mark.parametrize("add_residual", [True, False]) @pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) @pytest.mark.parametrize("use_rocm_aiter_norm", ["0", "1"]) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 4c2055361d445..ec885386dd940 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -174,15 +174,8 @@ SAMPLE_JSON_SCHEMA = { @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: +def test_models(hf_runner, vllm_runner, example_prompts, model: str, + dtype: str, max_tokens: int, num_logprobs: int) -> None: # TODO(sang): Sliding window should be tested separately. 
with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( @@ -206,14 +199,8 @@ def test_models( @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_mistral_format( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: +def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str, + max_tokens: int, num_logprobs: int) -> None: with vllm_runner( model, dtype=dtype, @@ -244,11 +231,8 @@ def test_mistral_format( @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) -def test_mistral_symbolic_languages( - vllm_runner, - model: str, - dtype: str, -) -> None: +def test_mistral_symbolic_languages(vllm_runner, model: str, + dtype: str) -> None: with vllm_runner(model, dtype=dtype, max_model_len=8192, @@ -266,11 +250,7 @@ def test_mistral_symbolic_languages( @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) # v1 can't do func calling -def test_mistral_function_calling( - vllm_runner, - model: str, - dtype: str, -) -> None: +def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None: with vllm_runner(model, dtype=dtype, tokenizer_mode="mistral", @@ -301,11 +281,8 @@ def test_mistral_function_calling( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("guided_backend", ["outlines", "lm-format-enforcer", "xgrammar"]) -def test_mistral_guided_decoding( - vllm_runner, - model: str, - guided_backend: str, -) -> None: +def test_mistral_guided_decoding(vllm_runner, model: str, + guided_backend: str) -> None: with vllm_runner(model, dtype='bfloat16', tokenizer_mode="mistral") as vllm_model: diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 19cf29d3e6591..e74e14a0dcb64 100644 --- 
a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -23,8 +23,14 @@ MODELS = [ reason="FP8 is not supported on this GPU type.") @pytest.mark.parametrize("model_id", MODELS) @pytest.mark.parametrize("force_marlin", [False, True]) +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool, - monkeypatch) -> None: + use_rocm_aiter: bool, monkeypatch) -> None: + + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + if force_marlin: monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") @@ -47,7 +53,13 @@ KV_CACHE_MODELS = [ @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="FP8 is not supported on this GPU type.") @pytest.mark.parametrize("model_id", KV_CACHE_MODELS) -def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, monkeypatch): +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, + use_rocm_aiter: bool, monkeypatch): + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + # vllm_runner.apply_model() relies on V0 internals. monkeypatch.setenv("VLLM_USE_V1", "0") with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: @@ -86,8 +98,13 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, monkeypatch): reason="FP8 is not supported on this GPU type.") @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) @pytest.mark.parametrize("force_marlin", [False, True]) +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, - monkeypatch) -> None: + use_rocm_aiter: bool, monkeypatch) -> None: + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + # vllm_runner.apply_model() relies on V0 internals. 
monkeypatch.setenv("VLLM_USE_V1", "0") diff --git a/vllm/envs.py b/vllm/envs.py index b4305d9c8e22c..4c413006a6413 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -73,6 +73,8 @@ if TYPE_CHECKING: VLLM_DISABLED_KERNELS: list[str] = [] VLLM_USE_V1: bool = True VLLM_ROCM_USE_AITER: bool = False + VLLM_ROCM_USE_AITER_MOE: bool = True + VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE: bool = False VLLM_ROCM_USE_AITER_RMSNORM: bool = True VLLM_ROCM_FP8_PADDING: bool = True VLLM_ROCM_MOE_PADDING: bool = True @@ -513,6 +515,19 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1")), + # Whether to use aiter moe ops. + # By default is enabled. + "VLLM_ROCM_USE_AITER_MOE": + lambda: (os.getenv("VLLM_ROCM_USE_AITER_MOE", "True").lower() in + ("true", "1")), + + # Whether to use aiter block scaled moe kernel. + # By default this is disabled. + "VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE": + lambda: + (os.getenv("VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE", "false").lower() in + ("true", "1")), + # use aiter rms norm op if aiter ops are enabled. 
"VLLM_ROCM_USE_AITER_RMSNORM": lambda: (os.getenv("VLLM_ROCM_USE_AITER_RMSNORM", "True").lower() in diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 4de020ff81c0e..97e915c60335a 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -17,6 +17,10 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op +from .rocm_aiter_fused_moe import (is_rocm_aiter_moe_enabled, + rocm_aiter_fused_experts, + rocm_aiter_topk_softmax) + logger = init_logger(__name__) @@ -1035,6 +1039,28 @@ def try_get_optimal_moe_config( return config +def vllm_topk_softmax(topk_weights: torch.Tensor, topk_indices: torch.Tensor, + token_expert_indices: torch.Tensor, + gating_output: torch.Tensor, + renormalize: bool) -> tuple[torch.Tensor, ...]: + ops.topk_softmax( + topk_weights, + topk_indices, + token_expert_indices, + gating_output, + ) + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + + return topk_weights, topk_indices + + +def dispatch_topk_func() -> Callable[..., tuple[torch.Tensor, ...]]: + if is_rocm_aiter_moe_enabled(): + return rocm_aiter_topk_softmax + return vllm_topk_softmax + + def fused_topk( hidden_states: torch.Tensor, gating_output: torch.Tensor, @@ -1059,17 +1085,14 @@ def fused_topk( dtype=torch.int32, device=hidden_states.device) - ops.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - gating_output.float(), # TODO(woosuk): Optimize this. - ) + gating_output_float = gating_output.float() # TODO(woosuk): Optimize this. + + topk_func = dispatch_topk_func() + topk_weights, topk_ids = topk_func(topk_weights, topk_ids, + token_expert_indicies, + gating_output_float, renormalize) + del token_expert_indicies # Not used. Will be used in the future. 
- - if renormalize: - topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) - return topk_weights, topk_ids @@ -1259,6 +1282,24 @@ direct_register_custom_op( ) +def torch_vllm_inplace_fused_experts(**kwargs) -> torch.Tensor: + torch.ops.vllm.inplace_fused_experts(**kwargs) + hidden_states = kwargs['hidden_states'] + return hidden_states + + +def torch_vllm_outplace_fused_experts(**kwargs) -> torch.Tensor: + return torch.ops.vllm.outplace_fused_experts(**kwargs) + + +def dispatch_fused_experts_func(inplace: bool) -> Callable[..., torch.Tensor]: + if is_rocm_aiter_moe_enabled(): + return rocm_aiter_fused_experts + if inplace: + return torch_vllm_inplace_fused_experts + return torch_vllm_outplace_fused_experts + + def fused_experts(hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -1278,20 +1319,25 @@ def fused_experts(hidden_states: torch.Tensor, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[List[int]] = None) -> torch.Tensor: - - if inplace: - torch.ops.vllm.inplace_fused_experts( - hidden_states, w1, w2, topk_weights, topk_ids, activation, - use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, global_num_experts, - expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, - block_shape) - return hidden_states - else: - return torch.ops.vllm.outplace_fused_experts( - hidden_states, w1, w2, topk_weights, topk_ids, activation, - use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, global_num_experts, - expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, - block_shape) + return dispatch_fused_experts_func(inplace)( + hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + activation=activation, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=w1_zp, + w2_zp=w2_zp, + 
a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_shape) def fused_experts_impl(hidden_states: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index bc134f676159e..b72f51aa52bfa 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -8,7 +8,7 @@ import torch import torch.nn.functional as F from torch.nn.parameter import UninitializedParameter -from vllm import envs +import vllm.envs as envs from vllm.config import get_current_vllm_config from vllm.distributed import (get_dp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -16,6 +16,8 @@ from vllm.distributed import (get_dp_group, get_tensor_model_parallel_rank, from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + is_rocm_aiter_moe_enabled, shuffle_weights) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.utils import set_weight_attrs @@ -118,6 +120,16 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): layer.w2_weight.data), requires_grad=False) + if is_rocm_aiter_moe_enabled(): + # reshaping weights is required for aiter moe kernel. 
+ shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight.data, layer.w2_weight.data) + + layer.w13_weight = torch.nn.Parameter(shuffled_w13, + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(shuffled_w2, + requires_grad=False) + if current_platform.is_cpu(): if current_platform.get_cpu_architecture() == CpuArchEnum.X86: import intel_extension_for_pytorch as ipex diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py new file mode 100644 index 0000000000000..c9bb676710a78 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -0,0 +1,157 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import List, Optional + +import torch + +import vllm.envs as envs +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8) +from vllm.platforms import current_platform + + +def is_rocm_aiter_moe_enabled() -> bool: + return current_platform.is_rocm() \ + and envs.VLLM_ROCM_USE_AITER_MOE \ + and envs.VLLM_ROCM_USE_AITER \ + + +def is_rocm_aiter_block_scaled_moe_enabled() -> bool: + return is_rocm_aiter_moe_enabled() and \ + envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE + + +def rocm_aiter_fused_experts( + *, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + use_fp8_w8a8: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, + expert_mask: Optional[torch.Tensor] = None, + **kwagrs # Ignore additional keyword arguments +) -> torch.Tensor: + + import aiter as rocm_aiter + import aiter.fused_moe_bf16_asm as rocm_aiter_asm_fmoe + + if envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE and use_fp8_w8a8: + assert w1_scale is not None + assert w2_scale is not None + + local_E = E = w1.shape[0] + if expert_mask is not None: + E = expert_mask.numel() + + topk 
= topk_ids.shape[1] + model_dim = w1.shape[-1] + dtype = hidden_states.dtype + # The default block sizes are 128 in AITER. + if block_shape is None: + block_shape = [128, 128] + + scale_blk_k = block_shape[1] + + ( + sorted_token_ids, + sorted_weight_buf, + sorted_expert_ids, + num_valid_ids, + out_asm, + ) = rocm_aiter_asm_fmoe.moe_sorting_ck(topk_ids, + topk_weights, + E, + model_dim, + dtype, + expert_mask=expert_mask) + + a1, a1_scale = per_token_group_quant_fp8(hidden_states, scale_blk_k) + rocm_aiter.fmoe_fp8_blockscale_g1u1( + out_asm, + a1, + w1, + w2, + sorted_token_ids, + sorted_weight_buf, + sorted_expert_ids, + num_valid_ids, + topk, + w1_scale.view(local_E, -1), + w2_scale.view(local_E, -1), + a1_scale.t().contiguous(), + block_shape[0], + block_shape[1], + None, + ) + return out_asm + + elif use_fp8_w8a8: + return rocm_aiter_asm_fmoe.asm_moe(hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weight=topk_weights, + topk_ids=topk_ids, + fc1_scale=w1_scale, + fc2_scale=w2_scale, + fc1_smooth_scale=None, + fc2_smooth_scale=None, + a16=False) + + return rocm_aiter.ck_moe(hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids) + + +def rocm_aiter_topk_softmax(topk_weights: torch.Tensor, + topk_indices: torch.Tensor, + token_expert_indices: torch.Tensor, + gating_output: torch.Tensor, + renormalize: bool) -> tuple[torch.Tensor, ...]: + import aiter as rocm_aiter + rocm_aiter.topk_softmax(topk_weights, topk_indices, token_expert_indices, + gating_output, renormalize) + + return topk_weights, topk_indices + + +def shuffle_weights(*tensors: torch.Tensor) -> tuple[torch.Tensor, ...]: + """ + Applies shuffle_weight function from AITER to each + input tensor and returns them. + + Args: + *tensors: Variable number of torch.Tensor objects. + + Returns: + A tuple of shuffled tensors. 
+ """ + from aiter.ops.shuffle import shuffle_weight + + return tuple(shuffle_weight(tensor) for tensor in tensors) + + +def expand_weights(*tensors: torch.Tensor, + expansion_dims: list[int]) -> tuple[torch.Tensor, ...]: + """ + Expands the dimensions of input tensors. + + Args: + *tensors: A variable number of torch.Tensor objects. + expansion_dims: A list of expansion dimensions + corresponding to each tensor. + + Returns: + A tuple of tensors with expanded dimensions. + """ + + assert len(tensors) == len(expansion_dims), \ + "Number of tensors must match the number of expansion dimensions." + + return tuple( + tensor.unsqueeze(-1).unsqueeze(-1).expand((-1, dim, -1)) + for tensor, dim in zip(tensors, expansion_dims)) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index d92b0931a6ee0..bc17a569da2c3 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -13,6 +13,9 @@ from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) +from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + expand_weights, is_rocm_aiter_block_scaled_moe_enabled, + is_rocm_aiter_moe_enabled, shuffle_weights) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) from vllm.model_executor.layers.quantization.base_config import ( @@ -554,6 +557,15 @@ class Fp8MoEMethod(FusedMoEMethodBase): layer.w2_weight = Parameter(w2_weight, requires_grad=False) layer.w2_weight_scale_inv = Parameter(w2_weight_scale_inv, requires_grad=False) + if is_rocm_aiter_block_scaled_moe_enabled(): + # reshaping weights is required for aiter moe kernel. 
+ shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight.data, layer.w2_weight.data) + + layer.w13_weight = torch.nn.Parameter(shuffled_w13, + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(shuffled_w2, + requires_grad=False) return # If checkpoint is fp16, quantize in place. @@ -581,6 +593,26 @@ class Fp8MoEMethod(FusedMoEMethodBase): requires_grad=False) layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) + if is_rocm_aiter_moe_enabled(): + # reshaping weights is required for aiter moe kernel. + w13_scales, w2_scales = expand_weights( + layer.w13_weight_scale.data, + layer.w2_weight_scale.data, + expansion_dims=[ + layer.w13_weight.shape[1], layer.w2_weight.shape[1] + ]) + layer.w13_weight_scale = torch.nn.Parameter( + w13_scales.contiguous(), requires_grad=False) + layer.w2_weight_scale = torch.nn.Parameter( + w2_scales.contiguous(), requires_grad=False) + + shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight, layer.w2_weight) + + layer.w13_weight = torch.nn.Parameter(shuffled_w13, + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(shuffled_w2, + requires_grad=False) return # If checkpoint is fp8, we need to handle that the @@ -648,6 +680,26 @@ class Fp8MoEMethod(FusedMoEMethodBase): dq_weight, max_w13_scales[expert_id]) start += shard_size + if is_rocm_aiter_moe_enabled(): + # reshaping weights is required for aiter moe kernel. 
+ expansion_dims = [ + layer.w13_weight.shape[1], layer.w2_weight.shape[1] + ] + max_w13_scales, w2_scales = expand_weights( + max_w13_scales, + layer.w2_weight_scale.data, + expansion_dims=expansion_dims) + layer.w2_weight_scale = torch.nn.Parameter( + w2_scales.contiguous(), requires_grad=False) + + shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight, layer.w2_weight) + + layer.w13_weight = torch.nn.Parameter(shuffled_w13, + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(shuffled_w2, + requires_grad=False) + layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales, requires_grad=False) return From 99f536f83093dc22e4220b5bd0f8c63f9e86a406 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Wed, 26 Mar 2025 04:21:15 -0500 Subject: [PATCH 009/593] [Misc] Enhance warning information to user-defined chat template (#15408) Signed-off-by: wwl2755 --- tests/entrypoints/test_chat_utils.py | 10 +++---- vllm/entrypoints/chat_utils.py | 40 +++++++++++++++++---------- vllm/entrypoints/openai/api_server.py | 27 ++++++++++++++++-- 3 files changed, 55 insertions(+), 22 deletions(-) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 6efed990b1893..8cc51a5d73b3f 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -9,11 +9,11 @@ from transformers import __version__ as TRANSFORMERS_VERSION from vllm.assets.image import ImageAsset from vllm.config import ModelConfig -from vllm.entrypoints.chat_utils import (_resolve_hf_chat_template, - _try_extract_ast, load_chat_template, +from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template, parse_chat_messages, parse_chat_messages_futures, - resolve_chat_template_content_format) + resolve_chat_template_content_format, + resolve_hf_chat_template) from vllm.entrypoints.llm import apply_hf_chat_template from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import encode_image_base64 @@ -747,7 +747,7 
@@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): }] if use_tools else None # Test detecting the tokenizer's chat_template - chat_template = _resolve_hf_chat_template( + chat_template = resolve_hf_chat_template( tokenizer, chat_template=None, tools=tools, @@ -781,7 +781,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): tokenizer = tokenizer_group.tokenizer # Test detecting the tokenizer's chat_template - chat_template = _resolve_hf_chat_template( + chat_template = resolve_hf_chat_template( tokenizer, chat_template=None, tools=None, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index d3613384590de..73a69d3037f7f 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -306,7 +306,24 @@ def _detect_content_format( return "openai" -def _resolve_hf_chat_template( +def resolve_mistral_chat_template( + chat_template: Optional[str], + **kwargs: Any, +) -> Optional[str]: + if chat_template is not None: + logger.warning_once( + "'chat_template' cannot be overridden for mistral tokenizer.") + if "add_generation_prompt" in kwargs: + logger.warning_once( + "'add_generation_prompt' is not supported for mistral tokenizer, " + "so it will be ignored.") + if "continue_final_message" in kwargs: + logger.warning_once( + "'continue_final_message' is not supported for mistral tokenizer, " + "so it will be ignored.") + return None + +def resolve_hf_chat_template( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], chat_template: Optional[str], tools: Optional[list[dict[str, Any]]], @@ -352,7 +369,7 @@ def _resolve_chat_template_content_format( trust_remote_code: bool, ) -> _ChatTemplateContentFormat: if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)): - hf_chat_template = _resolve_hf_chat_template( + hf_chat_template = resolve_hf_chat_template( tokenizer, chat_template=chat_template, trust_remote_code=trust_remote_code, @@ -1140,7 +1157,7 @@ def 
apply_hf_chat_template( tokenize: bool = False, # Different from HF's default **kwargs: Any, ) -> str: - hf_chat_template = _resolve_hf_chat_template( + hf_chat_template = resolve_hf_chat_template( tokenizer, chat_template=chat_template, tools=tools, @@ -1169,17 +1186,12 @@ def apply_mistral_chat_template( tools: Optional[list[dict[str, Any]]], **kwargs: Any, ) -> list[int]: - if chat_template is not None: - logger.warning_once( - "'chat_template' cannot be overridden for mistral tokenizer.") - if "add_generation_prompt" in kwargs: - logger.warning_once( - "'add_generation_prompt' is not supported for mistral tokenizer, " - "so it will be ignored.") - if "continue_final_message" in kwargs: - logger.warning_once( - "'continue_final_message' is not supported for mistral tokenizer, " - "so it will be ignored.") + # The return value of resolve_mistral_chat_template is always None, + # and we won't use it. + resolve_mistral_chat_template( + chat_template=chat_template, + **kwargs, + ) return tokenizer.apply_chat_template( messages=messages, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index f9b1d69a31d8c..374e43fb15341 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -35,7 +35,9 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.engine.multiprocessing.engine import run_mp_engine from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import load_chat_template +from vllm.entrypoints.chat_utils import (load_chat_template, + resolve_hf_chat_template, + resolve_mistral_chat_template) from vllm.entrypoints.launcher import serve_http from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.cli_args import (make_arg_parser, @@ -84,6 +86,7 @@ from vllm.entrypoints.utils import load_aware_call, with_cancellation from vllm.logger import 
init_logger from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) +from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.usage.usage_lib import UsageContext from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path, is_valid_ipv6_address, set_ulimit) @@ -883,8 +886,26 @@ async def init_app_state( resolved_chat_template = load_chat_template(args.chat_template) if resolved_chat_template is not None: - logger.info("Using supplied chat template:\n%s", - resolved_chat_template) + # Get the tokenizer to check official template + tokenizer = await engine_client.get_tokenizer() + + if isinstance(tokenizer, MistralTokenizer): + # The warning is logged in resolve_mistral_chat_template. + resolved_chat_template = resolve_mistral_chat_template( + chat_template=resolved_chat_template) + else: + hf_chat_template = resolve_hf_chat_template( + tokenizer, + chat_template=None, + tools=None, + trust_remote_code=model_config.trust_remote_code) + + if hf_chat_template != resolved_chat_template: + logger.warning( + "Using supplied chat template: %s\n" + "It is different from official chat template '%s'. 
" + "This discrepancy may lead to performance degradation.", + resolved_chat_template, args.model) state.openai_serving_models = OpenAIServingModels( engine_client=engine_client, From 4ec2cee000af209a9499e0696993834af4f45035 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Wed, 26 Mar 2025 18:12:47 +0800 Subject: [PATCH 010/593] [Misc] improve example script output (#15528) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- examples/offline_inference/basic/basic.py | 5 ++++- examples/offline_inference/basic/chat.py | 5 +++-- examples/offline_inference/basic/classify.py | 4 +++- examples/offline_inference/basic/embed.py | 4 +++- examples/offline_inference/basic/score.py | 4 +++- 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/examples/offline_inference/basic/basic.py b/examples/offline_inference/basic/basic.py index a6e96c0bb4339..2ba5ec1192b19 100644 --- a/examples/offline_inference/basic/basic.py +++ b/examples/offline_inference/basic/basic.py @@ -18,7 +18,10 @@ llm = LLM(model="facebook/opt-125m") # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) # Print the outputs. 
+print("\nGenerated Outputs:\n" + "-" * 60) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") \ No newline at end of file + print(f"Prompt: {prompt!r}") + print(f"Output: {generated_text!r}") + print("-" * 60) \ No newline at end of file diff --git a/examples/offline_inference/basic/chat.py b/examples/offline_inference/basic/chat.py index b2523e533a40a..2dea45f843cf3 100644 --- a/examples/offline_inference/basic/chat.py +++ b/examples/offline_inference/basic/chat.py @@ -27,12 +27,13 @@ def main(args: dict): sampling_params.top_k = top_k def print_outputs(outputs): + print("\nGenerated Outputs:\n" + "-" * 80) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}") + print(f"Prompt: {prompt!r}\n") print(f"Generated text: {generated_text!r}") - print("-" * 80) + print("-" * 80) print("=" * 80) diff --git a/examples/offline_inference/basic/classify.py b/examples/offline_inference/basic/classify.py index 4ef949b4784de..72c29e4c77c30 100644 --- a/examples/offline_inference/basic/classify.py +++ b/examples/offline_inference/basic/classify.py @@ -23,12 +23,14 @@ def main(args: Namespace): outputs = model.classify(prompts) # Print the outputs. 
+ print("\nGenerated Outputs:\n" + "-" * 60) for prompt, output in zip(prompts, outputs): probs = output.outputs.probs probs_trimmed = ((str(probs[:16])[:-1] + ", ...]") if len(probs) > 16 else probs) - print(f"Prompt: {prompt!r} | " + print(f"Prompt: {prompt!r} \n" f"Class Probabilities: {probs_trimmed} (size={len(probs)})") + print("-" * 60) if __name__ == "__main__": diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py index f1655b6dbe111..0283909a2a84a 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/offline_inference/basic/embed.py @@ -23,12 +23,14 @@ def main(args: Namespace): outputs = model.embed(prompts) # Print the outputs. + print("\nGenerated Outputs:\n" + "-" * 60) for prompt, output in zip(prompts, outputs): embeds = output.outputs.embedding embeds_trimmed = ((str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds) - print(f"Prompt: {prompt!r} | " + print(f"Prompt: {prompt!r} \n" f"Embeddings: {embeds_trimmed} (size={len(embeds)})") + print("-" * 60) if __name__ == "__main__": diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py index 2d21f1f0e3971..83b8253f4e257 100644 --- a/examples/offline_inference/basic/score.py +++ b/examples/offline_inference/basic/score.py @@ -22,9 +22,11 @@ def main(args: Namespace): outputs = model.score(text_1, texts_2) # Print the outputs. 
+ print("\nGenerated Outputs:\n" + "-" * 60) for text_2, output in zip(texts_2, outputs): score = output.outputs.score - print(f"Pair: {[text_1, text_2]!r} | Score: {score}") + print(f"Pair: {[text_1, text_2]!r} \nScore: {score}") + print("-" * 60) if __name__ == "__main__": From cf5c8f1686d810883f27974fa4433f0f95c94cbe Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 26 Mar 2025 10:13:38 +0000 Subject: [PATCH 011/593] Separate base model from `TransformersModel` (#15467) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py --- docs/source/models/supported_models.md | 6 +- tests/distributed/test_pipeline_parallel.py | 2 +- tests/models/registry.py | 2 +- vllm/model_executor/model_loader/utils.py | 6 +- vllm/model_executor/models/registry.py | 4 +- vllm/model_executor/models/transformers.py | 149 +++++++++++++------- 6 files changed, 110 insertions(+), 59 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index f106195e10585..8ff18a17d36c3 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -57,10 +57,10 @@ llm = LLM(model=..., task="generate") # Name or path of your model llm.apply_model(lambda model: print(type(model))) ``` -If it is `TransformersModel` then it means it's based on Transformers! +If it is `TransformersForCausalLM` then it means it's based on Transformers! :::{tip} -You can force the use of `TransformersModel` by setting `model_impl="transformers"` for or `--model-impl transformers` for the . +You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for or `--model-impl transformers` for the . ::: :::{note} @@ -119,7 +119,7 @@ Here is what happens in the background: 1. The config is loaded 2. 
`MyModel` Python class is loaded from the `auto_map`, and we check that the model `_supports_attention_backend`. -3. The `TransformersModel` backend is used. See , which leverage `self.config._attn_implementation = "vllm"`, thus the need to use `ALL_ATTENTION_FUNCTION`. +3. The `TransformersForCausalLM` backend is used. See , which leverage `self.config._attn_implementation = "vllm"`, thus the need to use `ALL_ATTENTION_FUNCTION`. To make your model compatible with tensor parallel, it needs: diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index e757db45c8cf5..751c4eb096ae0 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -175,7 +175,7 @@ TEXT_GENERATION_MODELS = { "inceptionai/jais-13b-chat": PPTestSettings.fast(), "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(), "meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(), - # Tests TransformersModel + # Tests TransformersForCausalLM "ArthurZ/Ilama-3.2-1B": PPTestSettings.fast(), "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(), "openbmb/MiniCPM3-4B": PPTestSettings.fast(), diff --git a/tests/models/registry.py b/tests/models/registry.py index 5c84e85aaa907..d7946b75b7978 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -319,7 +319,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { } _FALLBACK_MODEL = { - "TransformersModel": _HfExamplesInfo("ArthurZ/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501 + "TransformersForCausalLM": _HfExamplesInfo("ArthurZ/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501 } _EXAMPLE_MODELS = { diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index ce90614329725..a252c7f8e57bc 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -45,7 +45,7 @@ def is_transformers_impl_compatible( def resolve_transformers_fallback(model_config: ModelConfig, 
architectures: list[str]): for i, arch in enumerate(architectures): - if arch == "TransformersModel": + if arch == "TransformersForCausalLM": continue auto_map: dict[str, str] = getattr(model_config.hf_config, "auto_map", None) or dict() @@ -69,7 +69,7 @@ def resolve_transformers_fallback(model_config: ModelConfig, raise ValueError( f"The Transformers implementation of {arch} is not " "compatible with vLLM.") - architectures[i] = "TransformersModel" + architectures[i] = "TransformersForCausalLM" if model_config.model_impl == ModelImpl.AUTO: if not is_transformers_impl_compatible(arch, custom_model_module): raise ValueError( @@ -80,7 +80,7 @@ def resolve_transformers_fallback(model_config: ModelConfig, "%s has no vLLM implementation, falling back to Transformers " "implementation. Some features may not be supported and " "performance may not be optimal.", arch) - architectures[i] = "TransformersModel" + architectures[i] = "TransformersForCausalLM" return architectures diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 7c8e506713833..7797d9a2cc203 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -201,7 +201,7 @@ _SPECULATIVE_DECODING_MODELS = { } _FALLBACK_MODEL = { - "TransformersModel": ("transformers", "TransformersModel"), + "TransformersForCausalLM": ("transformers", "TransformersForCausalLM"), } # yapf: enable @@ -425,7 +425,7 @@ class _ModelRegistry: # make sure Transformers fallback are put at the last if len(normalized_arch) != len(architectures): - normalized_arch.append("TransformersModel") + normalized_arch.append("TransformersForCausalLM") return normalized_arch def inspect_model_cls( diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 56ec00dcf222c..6ea149506581c 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -43,7 +43,8 @@ from 
vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant -from .utils import (PPMissingLayer, is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, + is_pp_missing_parameter, make_empty_intermediate_tensors_factory, maybe_prefix) logger = init_logger(__name__) @@ -110,13 +111,9 @@ def replace_linear_class( ) -@support_torch_compile -class TransformersModel(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): - embedding_padding_modules = ["lm_head"] - embedding_modules = ["embed_tokens" - ] # TODO transformers will have a util to get it +class TransformersModel(nn.Module): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() logger.info("Using Transformers backend.") @@ -134,9 +131,6 @@ class TransformersModel(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): self.parallel_config = parallel_config self.quant_config = quant_config - self.vocab_size = model_config.get_vocab_size() - self.unpadded_vocab_size = model_config.get_vocab_size() - self.pp_group = get_pp_group() self.pp_size = self.pp_group.world_size self.pp_rank = self.pp_group.rank_in_group @@ -144,13 +138,15 @@ class TransformersModel(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): # Use meta device to delay allocating GPU tensors with torch.device("meta"): + # FIXME(Isotr0py): We need to refactor this part in the future to + # avoid registering an extra model layer, otherwise we will need a + # weights mapper to rename weights. 
self.model: PreTrainedModel = AutoModel.from_config( config, attn_implementation="vllm", torch_dtype=model_config.dtype, trust_remote_code=model_config.trust_remote_code, ) - prefix = self.model.base_model_prefix self.pipeline_parallel() self.tensor_parallel() @@ -168,32 +164,12 @@ class TransformersModel(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): # Attention layers self.attention_instances = self.create_attention_instances() - # Output embeddings - if not isinstance(getattr(self, "lm_head", None), PPMissingLayer): - self.unpadded_vocab_size = config.vocab_size - self.lm_head = ParallelLMHead( - config.vocab_size, - config.hidden_size, - quant_config=quant_config, - prefix=maybe_prefix(prefix, "lm_head"), - ) - if config.tie_word_embeddings: - self.lm_head = self.lm_head.tie_weights( - self.model.get_input_embeddings()) - - logit_scale = getattr(config, "logit_scale", 1.0) - self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - config.vocab_size, - logit_scale) - # Initialize buffers (e.g. rotary embedding inverse frequency) self.init_buffers(self.model) # Move remaining meta tensors to device (should happen last) self.meta_to_empty(self.model) - self.sampler = get_sampler() - self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) @@ -248,9 +224,6 @@ class TransformersModel(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): if not self.pp_group.is_last_rank: setattr(self.model, name, PPMissingLayer()) - if not self.pp_group.is_last_rank: - self.lm_head = PPMissingLayer() - def tensor_parallel(self): """ Apply the model's tensor parallelization plan. 
@@ -331,6 +304,9 @@ class TransformersModel(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): for child in module.children(): self.meta_to_empty(child) + def get_input_embeddings(self) -> nn.Module: + return self.model.get_input_embeddings() + def forward( self, input_ids: Optional[torch.Tensor], @@ -361,21 +337,6 @@ class TransformersModel(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): return hidden_states - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - - def sample(self, logits: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]: - - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) @@ -393,3 +354,93 @@ class TransformersModel(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params + + +@support_torch_compile +class TransformersForCausalLM(nn.Module, SupportsQuant, SupportsLoRA, + SupportsPP): + embedding_padding_modules = ["lm_head"] + embedding_modules = ["embed_tokens" + ] # TODO transformers will have a util to get it + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config: PretrainedConfig = vllm_config.model_config.hf_config + quant_config: QuantizationConfig = vllm_config.quant_config + + self.config = config + + self.model = TransformersModel(vllm_config=vllm_config, prefix=prefix) + + if get_pp_group().is_last_rank: + self.unpadded_vocab_size = config.vocab_size + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), + ) + if config.tie_word_embeddings: + self.lm_head = 
self.lm_head.tie_weights( + self.model.get_input_embeddings()) + + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, + logit_scale) + else: + self.lm_head = PPMissingLayer() + + self.sampler = get_sampler() + + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + # FIXME(Isotr0py): Don't use any weights mapper for Transformers fallback, + # this makes thing complicated. We need to remove this mapper after refactor + # `TransformersModel` in the future. + @property + def hf_to_vllm_mapper(self): + prefix_mapper = { + name: "model." + name + for name, _ in self.model.model.named_children() + } + return WeightsMapper( + orig_to_new_substr={"model.": "model.model."}, + orig_to_new_prefix=prefix_mapper, + ) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + model_output = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) + return model_output + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample(self, logits: torch.Tensor, + sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]: + + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) From 1aa162e030d62fcf476388ac77c141cb5b52957b Mon Sep 17 00:00:00 2001 From: cyyever Date: Wed, 26 Mar 
2025 20:09:06 +0800 Subject: [PATCH 012/593] Apply torchfix (#15532) Signed-off-by: cyy --- vllm/attention/backends/rocm_flash_attn.py | 5 ++--- vllm/lora/models.py | 4 +++- vllm/model_executor/models/nemotron.py | 6 +++--- vllm/model_executor/models/phi4mm_utils.py | 9 ++++++--- vllm/multimodal/image.py | 2 +- 5 files changed, 15 insertions(+), 11 deletions(-) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index c47202099ac60..34f5fedcf36e8 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -884,9 +884,8 @@ def _sdpa_attention( for i, seq_len in enumerate(seq_lens): end = start + seq_len - with torch.backends.cuda.sdp_kernel(enable_math=True, - enable_flash=False, - enable_mem_efficient=False): + with torch.nn.attention.sdpa_kernel( + torch.nn.attention.SDPBackend.MATH): sub_out = torch.nn.functional.scaled_dot_product_attention( query[:, start:end, :], key[:, start:end, :], diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 22a45b60ca399..8164d919ca8b4 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -272,7 +272,9 @@ class LoRAModel(AdapterModel): f" target modules in {expected_lora_modules}" f" but received {unexpected_modules}." 
f" Please verify that the loaded LoRA module is correct") - tensors = torch.load(lora_bin_file_path, map_location=device) + tensors = torch.load(lora_bin_file_path, + map_location=device, + weights_only=True) else: raise ValueError(f"{lora_dir} doesn't contain tensors") diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index a2b4949496897..0ea296b2f93d1 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -63,8 +63,8 @@ def _cast_if_autocast_enabled(*args): if not torch.is_autocast_enabled(): return args else: - return torch.cuda.amp.autocast_mode._cast( - args, torch.get_autocast_gpu_dtype()) + return torch.amp.autocast_mode._cast( + args, device_type="cuda", dtype=torch.get_autocast_gpu_dtype()) class NemotronLayerNorm1P(nn.LayerNorm): @@ -89,7 +89,7 @@ class NemotronLayerNorm1P(nn.LayerNorm): residual = x args = _cast_if_autocast_enabled(x, self.normalized_shape, self.weight + 1, self.bias, self.eps) - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast("cuda", enabled=False): x = torch.nn.functional.layer_norm(*args) return x if residual is None else (x, residual) diff --git a/vllm/model_executor/models/phi4mm_utils.py b/vllm/model_executor/models/phi4mm_utils.py index ca00207a9b6f7..9f08a1c4c6f5a 100644 --- a/vllm/model_executor/models/phi4mm_utils.py +++ b/vllm/model_executor/models/phi4mm_utils.py @@ -1766,9 +1766,12 @@ class MultiHeadedAttention(nn.Module): if mask.dtype != q.dtype: attn_mask = attn_mask.to(q.dtype) - with torch.backends.cuda.sdp_kernel(enable_flash=True, - enable_math=True, - enable_mem_efficient=True): + with torch.nn.attention.sdpa_kernel([ + torch.nn.attention.SDPBackend.FLASH_ATTENTION, + torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION, + torch.nn.attention.SDPBackend.MATH, + torch.nn.attention.SDPBackend.CUDNN_ATTENTION, + ]): x = torch.nn.functional.scaled_dot_product_attention( q, k, diff --git a/vllm/multimodal/image.py 
b/vllm/multimodal/image.py index 255fac30bd78a..0c5a84c6508a1 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -149,7 +149,7 @@ class ImageEmbeddingMediaIO(MediaIO[torch.Tensor]): return self.load_bytes(base64.b64decode(data)) def load_file(self, filepath: Path) -> torch.Tensor: - return torch.load(filepath) + return torch.load(filepath, weights_only=True) def encode_base64(self, media: torch.Tensor) -> str: return base64.b64encode(media.numpy()).decode('utf-8') From c091c0a58898b8a0a76e18bd6724732d80fcfc28 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 26 Mar 2025 14:26:48 +0000 Subject: [PATCH 013/593] Improve validation of TP in Transformers backend (#15540) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 6ea149506581c..bdc390689104e 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -229,7 +229,10 @@ class TransformersModel(nn.Module): Apply the model's tensor parallelization plan. Currently only supports linear layers. 
""" - if self.tp_size > 1 and self.config.base_model_tp_plan is None: + if not self.model.supports_tp_plan: + if self.tp_size <= 1: + return + raise ValueError( f"{type(self.model)} does not support tensor parallel yet!") From 1711b929b6fadd02a7d66d936ec6ffd24e4c3b54 Mon Sep 17 00:00:00 2001 From: Alex Brooks Date: Wed, 26 Mar 2025 08:28:07 -0600 Subject: [PATCH 014/593] [Model] Add Reasoning Parser for Granite Models (#14202) Signed-off-by: Alex-Brooks Co-authored-by: Joe Runde --- docs/source/features/reasoning_outputs.md | 7 +- .../openai_chat_completion_with_reasoning.py | 1 + ...hat_completion_with_reasoning_streaming.py | 1 + .../test_granite_reasoning_parser.py | 349 +++++++++++++++++ vllm/engine/arg_utils.py | 2 +- .../openai/reasoning_parsers/__init__.py | 6 +- .../granite_reasoning_parser.py | 363 ++++++++++++++++++ .../guided_decoding/reasoner/__init__.py | 4 + 8 files changed, 730 insertions(+), 3 deletions(-) create mode 100644 tests/entrypoints/openai/reasoning_parsers/test_granite_reasoning_parser.py create mode 100644 vllm/entrypoints/openai/reasoning_parsers/granite_reasoning_parser.py diff --git a/docs/source/features/reasoning_outputs.md b/docs/source/features/reasoning_outputs.md index 0b170aadc3443..879b16d4f7b50 100644 --- a/docs/source/features/reasoning_outputs.md +++ b/docs/source/features/reasoning_outputs.md @@ -4,7 +4,7 @@ vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions. -Reasoning models return a additional `reasoning_content` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models. +Reasoning models return an additional `reasoning_content` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models. 
## Supported Models @@ -14,6 +14,9 @@ vLLM currently supports the following reasoning models: |--------------|-------------|------------------|-------------| | [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ | | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ | +| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ | + +- IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. ## Quickstart @@ -43,6 +46,7 @@ model = models.data[0].id # Round 1 messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] +# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` response = client.chat.completions.create(model=model, messages=messages) reasoning_content = response.choices[0].message.reasoning_content @@ -97,6 +101,7 @@ models = client.models.list() model = models.data[0].id messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] +# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` stream = client.chat.completions.create(model=model, messages=messages, stream=True) diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py index b5dbed1205d35..e753cedcdc08d 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning.py @@ -31,6 +31,7 @@ model = models.data[0].id # Round 1 messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] +# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` response = client.chat.completions.create(model=model, 
messages=messages) reasoning_content = response.choices[0].message.reasoning_content diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py index fe4332576d438..cb13b0c614aa1 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py @@ -38,6 +38,7 @@ models = client.models.list() model = models.data[0].id messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] +# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` stream = client.chat.completions.create(model=model, messages=messages, stream=True) diff --git a/tests/entrypoints/openai/reasoning_parsers/test_granite_reasoning_parser.py b/tests/entrypoints/openai/reasoning_parsers/test_granite_reasoning_parser.py new file mode 100644 index 0000000000000..84ac6600498b2 --- /dev/null +++ b/tests/entrypoints/openai/reasoning_parsers/test_granite_reasoning_parser.py @@ -0,0 +1,349 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest +from transformers import AutoTokenizer + +from tests.entrypoints.openai.reasoning_parsers.utils import ( + DeltaMessage, run_reasoning_extraction) +from vllm.entrypoints.openai.reasoning_parsers import (ReasoningParser, + ReasoningParserManager) + +parser_name = "granite" +START_REASONING = "Here is my thought process:" +START_RESPONSE = "Here is my response:" + +SIMPLE_REASONING = { + "output": + f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest", #noqa: E501 + "reasoning_content": "This is a reasoning section", + "content": "This is the rest", +} +COMPLETE_REASONING = { + "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}", + "reasoning_content": "This is a reasoning section", + "content": None, +} +NO_REASONING = { + "output": "This is content", + "reasoning_content": None, 
+ "content": "This is content", +} +MULTIPLE_LINES = { + "output": + f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat", + "reasoning_content": "This\nThat", + "content": "This is the rest\nThat", +} +REASONING_WITH_THINK = { + "output": + f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest", #noqa: E501 + "reasoning_content": "This is a reasoning section", + "content": "This is the rest", +} +COMPLETE_REASONING_WITH_THINK = { + "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}", + "reasoning_content": "This is a reasoning section", + "content": None, +} +MULTIPLE_LINES_WITH_THINK = { + "output": + f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat", + "reasoning_content": "This\nThat", + "content": "This is the rest\nThat", +} + +TEST_CASES = [ + pytest.param( + False, + SIMPLE_REASONING, + id="simple_reasoning", + ), + pytest.param( + False, + COMPLETE_REASONING, + id="complete_reasoning", + ), + pytest.param( + False, + NO_REASONING, + id="no_reasoning", + ), + pytest.param( + False, + MULTIPLE_LINES, + id="multiple_lines", + ), + pytest.param( + False, + REASONING_WITH_THINK, + id="reasoning_with_think", + ), + pytest.param( + False, + COMPLETE_REASONING_WITH_THINK, + id="complete_reasoning_with_think", + ), + pytest.param( + False, + MULTIPLE_LINES_WITH_THINK, + id="multiple_lines_with_think", + ), + pytest.param( + True, + SIMPLE_REASONING, + id="simple_reasoning_streaming", + ), + pytest.param( + True, + COMPLETE_REASONING, + id="complete_reasoning_streaming", + ), + pytest.param( + True, + NO_REASONING, + id="no_reasoning_streaming", + ), + pytest.param( + True, + MULTIPLE_LINES, + id="multiple_lines_streaming", + ), + pytest.param( + True, + REASONING_WITH_THINK, + id="reasoning_with_think_streaming", + ), + pytest.param( + True, + COMPLETE_REASONING_WITH_THINK, + id="complete_reasoning_with_think_streaming", + ), + pytest.param( + True, + MULTIPLE_LINES_WITH_THINK, + 
id="multiple_lines_with_think_streaming", + ), +] + +# Global tokenizer initialization to avoid repeated loading +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m") + + +@pytest.mark.parametrize("streaming, param_dict", TEST_CASES) +def test_reasoning( + streaming: bool, + param_dict: dict, +): + output = tokenizer.tokenize(param_dict["output"]) + # decode everything to tokens + output_tokens: list[str] = [ + tokenizer.convert_tokens_to_string([token]) for token in output + ] + parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser( + parser_name)(tokenizer) + + reasoning, content = run_reasoning_extraction(parser, + output_tokens, + streaming=streaming) + + assert reasoning == param_dict["reasoning_content"] + assert content == param_dict["content"] + + +# Additional tests for verifying the correctness of granite streaming; this +# is complicated because granite uses multiple tokens to indicate when thinking +# is starting / when it's starting its response, so skipping special tokens +# is awkward. 
+ +### Handling the start of reasoning +STREAMING_1 = { + "previous_text": None, + "current_text": "Here", + "delta_text": "Here", + "reasoning_content": None, + "content": None, +} +# When we fail, we should give what was previously being silenced first +STREAMING_2 = { + "previous_text": "Here is my thought", + "current_text": "Here is my thought failure", + "delta_text": " failure", + "reasoning_content": None, + "content": "Here is my thought failure", +} +# But then after the first one, we should only add the delta text to content +STREAMING_3 = { + "previous_text": "Here wrong", + "current_text": " words", + "delta_text": " Here wrong words", + "reasoning_content": None, + "content": " words", +} +# But then after the first one, we should only add the delta text to content +STREAMING_4 = { + "previous_text": "Here is my thought", + "current_text": "Here is my thought process:", + "delta_text": " process:", + "reasoning_content": None, + "content": None, +} +# Reasoning started successfully; parse reasoning content +STREAMING_5 = { + "previous_text": "Here is my thought process:", + "current_text": "Here is my thought process: foo", + "delta_text": " foo", + "reasoning_content": " foo", + "content": None, +} +# Response special sequence has started, but not finished. +STREAMING_6 = { + "previous_text": "Here is my thought process: foo", + "current_text": "Here is my thought process: foo Here is", + "delta_text": " Here is", + "reasoning_content": " ", + "content": None, +} +# Response special sequence started, but was broken; the reasoning +# content should be the content that was previously unused. 
+STREAMING_7 = { + "previous_text": "Here is my thought process: foo Here is", + "current_text": "Here is my thought process: foo Here is Here", + "delta_text": " Here", + "reasoning_content": "Here is ", + "content": None, +} +# Response special sequence is ongoing +STREAMING_8 = { + "previous_text": "Here is my thought process: foo Here is my response:", + "current_text": "Here is my thought process: foo Here is my response: bar", + "delta_text": " bar", + "reasoning_content": None, + "content": " bar", +} +# The delta text has everything; we should be able to correctly parse both +STREAMING_9 = { + "previous_text": None, + "current_text": "Here is my thought process: foo Here is my response: bar", + "delta_text": "Here is my thought process: foo Here is my response: bar", + "reasoning_content": " foo ", + "content": " bar", +} +## The Response is ongoing, and the delta mixes reasoning content / content +STREAMING_10 = { + "previous_text": "Here is my thought process: foo", + "current_text": + "Here is my thought process: foo bar Here is my response: baz", + "delta_text": " bar Here is my response: baz", + "reasoning_content": " bar ", + "content": " baz", +} +# The delta text starts a new substring that might be a response special seq +STREAMING_11 = { + "previous_text": + "Here is my thought process: This is a reasoning section ", + "current_text": + "Here is my thought process: This is a reasoning section Here", + "delta_text": "Here", + "reasoning_content": None, + "content": None, +} +# The delta text is finishing the response special seq +STREAMING_12 = { + "previous_text": "Here is my thought process: foo Here is my response", + "current_text": "Here is my thought process: foo Here is my response:", + "delta_text": ":", + "reasoning_content": None, + "content": None, +} +STREAMING_13 = { + "previous_text": "Here is my thought process: foo Here", + "current_text": "Here is my thought process: foo Here was", + "delta_text": " was", + "reasoning_content": 
"Here was",
+    "content": None,
+}
+
+STREAMING_SUBCASES = [
+    pytest.param(
+        STREAMING_1,
+        id="Starting reasoning special sequence",
+    ),
+    pytest.param(
+        STREAMING_2,
+        id="Unexpected start reasoning sequence",
+    ),
+    pytest.param(
+        STREAMING_3,
+        id="Continuing unexpected start reasoning sequence",
+    ),
+    pytest.param(
+        STREAMING_4,
+        id="Only start reasoning sequence and nothing else",
+    ),
+    pytest.param(
+        STREAMING_5,
+        id="Reasoning content has started",
+    ),
+    pytest.param(
+        STREAMING_6,
+        id="Response special sequence has started",
+    ),
+    pytest.param(
+        STREAMING_7,
+        id="Response special sequence reset",
+    ),
+    pytest.param(
+        STREAMING_8,
+        id="Response text has started",
+    ),
+    pytest.param(
+        STREAMING_9,
+        id="Delta contains everything",
+    ),
+    pytest.param(
+        STREAMING_10,
+        id="Delta contains some reasoning and response",
+    ),
+    pytest.param(
+        STREAMING_11,
+        id="Delta starts response sequence",
+    ),
+    pytest.param(
+        STREAMING_12,
+        id="Delta finishes response sequence",
+    ),
+    pytest.param(
+        STREAMING_13,
+        id="Delta breaks potential response sequence",
+    ),
+]
+
+
+@pytest.mark.parametrize("param_dict", STREAMING_SUBCASES)
+def test_streaming_subcases(param_dict):
+    # Get all of the token IDs
+    previous_token_ids = tokenizer.encode(
+        param_dict["previous_text"]
+    ) if param_dict["previous_text"] is not None else []
+    current_token_ids = tokenizer.encode(param_dict["current_text"])
+    delta_token_ids = tokenizer.encode(param_dict["delta_text"])
+
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
+        parser_name)(tokenizer)
+
+    response = parser.extract_reasoning_content_streaming(
+        previous_text=param_dict["previous_text"],
+        current_text=param_dict["current_text"],
+        delta_text=param_dict["delta_text"],
+        previous_token_ids=previous_token_ids,
+        current_token_ids=current_token_ids,
+        delta_token_ids=delta_token_ids,
+    )
+    # Streaming currently expects at least one of reasoning content / content,
+    # so the response
should return None in that case. + if param_dict["reasoning_content"] is None and param_dict[ + "content"] is None: + assert response is None + else: + assert isinstance(response, DeltaMessage) + assert param_dict["reasoning_content"] == response.reasoning_content + assert param_dict["content"] == response.content diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 75ac326aaa3d6..be00689f2b55f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1099,7 +1099,7 @@ class EngineArgs: parser.add_argument( "--reasoning-parser", type=str, - choices=["deepseek_r1"], + choices=["deepseek_r1", "granite"], default=None, help= "Select the reasoning parser depending on the model that you're " diff --git a/vllm/entrypoints/openai/reasoning_parsers/__init__.py b/vllm/entrypoints/openai/reasoning_parsers/__init__.py index 80354d69b50af..45132a780e5b2 100644 --- a/vllm/entrypoints/openai/reasoning_parsers/__init__.py +++ b/vllm/entrypoints/openai/reasoning_parsers/__init__.py @@ -2,7 +2,11 @@ from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser +from .granite_reasoning_parser import GraniteReasoningParser __all__ = [ - "ReasoningParser", "ReasoningParserManager", "DeepSeekR1ReasoningParser" + "ReasoningParser", + "ReasoningParserManager", + "DeepSeekR1ReasoningParser", + "GraniteReasoningParser", ] diff --git a/vllm/entrypoints/openai/reasoning_parsers/granite_reasoning_parser.py b/vllm/entrypoints/openai/reasoning_parsers/granite_reasoning_parser.py new file mode 100644 index 0000000000000..117d051a73782 --- /dev/null +++ b/vllm/entrypoints/openai/reasoning_parsers/granite_reasoning_parser.py @@ -0,0 +1,363 @@ +# SPDX-License-Identifier: Apache-2.0 + +import re +from collections.abc import Sequence +from typing import Optional, Union + +from transformers import PreTrainedTokenizerBase + +from vllm.entrypoints.openai.protocol import 
(ChatCompletionRequest,
+                                                  DeltaMessage)
+from vllm.entrypoints.openai.reasoning_parsers.abs_reasoning_parsers import (
+    ReasoningParser, ReasoningParserManager)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+@ReasoningParserManager.register_module("granite")
+class GraniteReasoningParser(ReasoningParser):
+    """
+    Reasoning parser for IBM Granite.
+
+    IBM granite models currently use "Here is my thought process:"
+    and "Here is my response:" to separate its thinking / response outputs.
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase):
+        super().__init__(tokenizer)
+
+        # NOTE: There have been some observed occurrences of quantized
+        # instances of the current models using "Here's" instead of "Here is",
+        # so to be safe, we match on both.
+        self.think_start_expr = r"(?:Here's|Here is) my thought process:"
+        self.response_start_expr = r"(?:Here's|Here is) my response:"
+
+        self.reasoning_regex = re.compile(
+            rf"{self.think_start_expr}(.*?){self.response_start_expr}(.*)",
+            re.DOTALL)
+
+        self.valid_think_starts = [
+            "Here's my thought process:", "Here is my thought process:"
+        ]
+        self.valid_response_starts = [
+            "Here's my response:", "Here is my response:"
+        ]
+
+        # Substrings to match for sequence boundaries on raw text
+        self.seq_boundary_end = ":"
+        self.seq_boundary_start = "Here"
+
+        # The longest any thinking / start of response message can be
+        self.longest_think_start = max(
+            len(think_start) for think_start in self.valid_think_starts)
+
+    def extract_reasoning_content(
+            self, model_output: str, request: ChatCompletionRequest
+    ) -> tuple[Optional[str], Optional[str]]:
+        """Extract the reasoning content & content sections, respectively.
+        If the sequence doesn't match what we expect, i.e., the model generates
+        something else, all content is considered non-reasoning content.
+
+        Args:
+            model_output (str): Output of the model to be parsed.
+            request (ChatCompletionRequest): Request being processed.
+ + Returns: + tuple[Optional[str], Optional[str]]: Tuple pair containing the + reasoning content and non-reasoning content. + """ + re_match = self.reasoning_regex.findall(model_output) + if not re_match: + return None, model_output + reasoning_content, response_content = re_match[0] + if not response_content: + return reasoning_content, None + return reasoning_content, response_content + + def extract_reasoning_content_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + """Extract the reasoning content / content emitted by granite models; + If the sequence doesn't match what we expect, i.e., the model generates + something else, all content is considered non-reasoning content. + + NOTE: Granite models do not use a special token to start their reasoning + and response sections; instead they have token sequences, e.g., + + Here is my thought process: Foo Here is my response: Bar + + This increases the complexity of correctly handling streams, since we + need to watch for specific sequences and correctly parse them without + dropping content that is potentially overlapping & spanning multiple + delta messages. + + Args: + previous_text (str): Previous text outside of this delta message. + current_text (str): Previous text + delta text. + delta_text (str): Text to consider and parse content from. + previous_token_ids (Sequence[int]): Token IDs of previous_text. + current_token_ids (Sequence[int]): Token IDs of current_text. + delta_token_ids (Sequence[int]): Token IDs of delta_text. + + Returns: + Union[DeltaMessage, None] + DeltaMessage with either reasoning content or content, or None. 
+ """ + reasoning_content, resp_seq_len, content = self._get_content_sections( + current_text) + # Either we haven't finished the start of the reasoning sequence, + # or the model is generating something unexpected. + if not reasoning_content: + delta_message = self._get_delta_message_with_no_reasoning_bounds( + current_text, delta_text) + # We have a start of reasoning message, but have not yet finished + # the start of response sequence. + elif not content: + delta_message = self._get_delta_message_with_no_response_bounds( + current_text, reasoning_content, delta_text) + # We've finished both the start of reasoning and start of response seq. + else: + # This should never happen since we matched on the response + assert resp_seq_len is not None + delta_message = self._get_delta_message_with_both_bounds( + delta_text, reasoning_content, content, current_text, + resp_seq_len) + if not delta_message.content and not delta_message.reasoning_content: + return None + return delta_message + + #### Implementation details of stream parsing for granite models + def _is_reasoning_start_substr(self, text: str) -> bool: + """Check if a text matches one of the possible start reasoning seqs. + + Args: + text (str): Text to check for leading substr. + + Returns: + bool: True if any of the possible reasoning start seqs match. + """ + return any( + think_start.startswith(text) + for think_start in self.valid_think_starts) + + def _is_response_start_substr(self, text: str) -> bool: + """Check if a text matches one of the possible start response seqs. + + Args: + text (str): Text to check for leading substr. + + Returns: + bool: True if any of the possible response start seqs match. 
+ """ + return any( + response_start.startswith(text) + for response_start in self.valid_response_starts) + + def _get_delta_message_with_no_reasoning_bounds( + self, + current_text: str, + delta_text: str, + ) -> DeltaMessage: + """Parse the delta message when the current text has not yet completed + its start of reasoning sequence. + + Args: + current_text (str): The full previous + delta text. + delta_text (str): Text to consider and parse content from. + + Returns: + DeltaMessage: Message containing the parsed content. + """ + prev_longest_length = len(current_text) - len(delta_text) + is_substr = self._is_reasoning_start_substr(current_text) + was_substr = self._is_reasoning_start_substr( + current_text[:prev_longest_length]) + + # Check if we just generated something NOT in the special token seq; + # if so, add everything that we previously skipped with this delta + # message and append everything to content in the future. + if was_substr and not is_substr: + return DeltaMessage( + reasoning_content=None, + content=current_text, + ) + if is_substr: + # Might still be in the special token sequence; return nothing + return DeltaMessage(reasoning_content=None, content=None) + # Otherwise the sequence has already been broken and we already + # corrected; just return the delta text as normal content. + return DeltaMessage(reasoning_content=None, content=delta_text) + + def _get_delta_message_with_no_response_bounds( + self, + current_text: str, + reasoning_content: str, + delta_text: str, + ) -> DeltaMessage: + """Parse the delta message when the current text has both reasoning + content with no (response) content. NOTE that we may have overlapping + tokens with the start of reasoning / start of response sequences on + either side of the delta text. + + Args: + current_text (str): The full previous + delta text. + reasoning_content (str): reasoning content from current_text. + delta_text (str): Text to consider and parse content from. 
+ + Returns: + DeltaMessage: Message containing the parsed content. + """ + # If we have no reasoning content or explicitly end with the start of + # response sequence, we are in transition to the response; need to be + # careful here, since the final token (:) will match the reasoning + # content and fully parse it out; we should not pass the : back. + ends_with_start_response_seq = any( + current_text.endswith(response_start) + for response_start in self.valid_response_starts) + if reasoning_content is None or ends_with_start_response_seq: + return DeltaMessage(reasoning_content=None, content=None) + + # Consider previous / current text only within context of the reasoning + previous_text = reasoning_content[:-len(delta_text)] + current_text = reasoning_content + + # We need to be careful about adding unfinished response sequences; + # Find the place at which we MIGHT be starting a response sequence + prev_idx = previous_text.rfind(self.seq_boundary_start) + delta_idx = delta_text.rfind(self.seq_boundary_start) + + # Check the state of potential start of response substring matches. + prev_was_substr = self._is_response_start_substr( + previous_text[prev_idx:]) if prev_idx >= 0 else False + delta_continues_substr = self._is_response_start_substr( + current_text[prev_idx:]) if prev_idx >= 0 else False + delta_new_substr = self._is_response_start_substr( + delta_text[delta_idx:]) if delta_idx >= 0 else False + + # Delta only contains potential continued response sequence text. + if delta_continues_substr: + return DeltaMessage(reasoning_content=None, content=None) + + if not prev_was_substr: + # Delta may be starting a new response seq but has other text too. + if delta_new_substr: + return DeltaMessage(reasoning_content=delta_text[:delta_idx], + content=None) + # Normal case for most reasoning text (no potential special seqs). 
+ return DeltaMessage(reasoning_content=delta_text, content=None) + # The substring that previously seemed to be a potential response + # seq wasn't one; we need to add the content to the delta message, + # and also slice off the potential response sequence + elif delta_new_substr: + reasoning_content = previous_text[ + prev_idx:] + delta_text[:delta_idx] + return DeltaMessage(reasoning_content=reasoning_content, + content=None) + # No new substring yet, and we broke our old one; take the whole delta + return DeltaMessage( + reasoning_content=previous_text[prev_idx:] + delta_text, + content=None, + ) + + def _get_delta_message_with_both_bounds( + self, + delta_text: str, + reasoning_content: str, + response_content: str, + current_text: str, + response_seq_len: int, + ) -> DeltaMessage: + """Parse the delta message when the current text has both reasoning + content and normal (response) content. + + Args: + delta_text (str): Text to consider and parse content from. + reasoning_content (str): reasoning content from current_text. + response_content (str): response content from current_text. + current_text (str): The full previous + delta text. + response_seq_len(str): Len of the complete response sequence used. + + Returns: + DeltaMessage: Message containing the parsed content. 
+ """ + # Always have content; take length to the end + delta_content = delta_text[-len(response_content):] + reasoning_end_idx = len(delta_text) - (len(response_content) + + response_seq_len) + + if reasoning_end_idx < 0: + delta_reasoning_content = None + else: + # Get the starting offset + start_reasoning_content_idx = len( + reasoning_content) + response_seq_len + len( + response_content) - 1 + delta_offset = len(current_text) - len(delta_text) + start_offset = start_reasoning_content_idx - delta_offset + if start_offset < 0: + start_offset = 0 + delta_reasoning_content = delta_text[ + start_offset:reasoning_end_idx] + + return DeltaMessage( + reasoning_content=delta_reasoning_content, + content=delta_content, + ) + + def _get_content_sections( + self, current_text: str + ) -> tuple[Optional[str], Optional[int], Optional[str]]: + """Parse the text to extract the reasoning content / content + if we have them. + + Args: + current_text (str): The full previous + delta text. + + Returns: + tuple[Optional[str], Optional[int], Optional[str]]: Tuple of len 3 + containing the reasoning content, the length of the response seq + (if there is one) and the non-reasoning content. 
+        """
+        current_chunk_start = 0
+        start_reasoning_content = None
+        parsed_content = False
+        delimiter_idxs = [
+            idx for idx, char in enumerate(current_text)
+            if char == self.seq_boundary_end
+        ]
+
+        for current_chunk_end in delimiter_idxs:
+            current_chunk = current_text[current_chunk_start:current_chunk_end]
+            # Check to see if the start of reasoning seq is complete
+            if start_reasoning_content is None:
+                for think_start in self.valid_think_starts:
+                    if current_chunk == think_start[:-1]:
+                        start_reasoning_content = current_chunk_end + 1
+                        current_chunk_start = current_chunk_end + 1
+                        break
+
+            # Check to see if the start of response seq is complete
+            elif not parsed_content:
+                for response_start in self.valid_response_starts:
+                    if current_chunk[-len(response_start) +
+                                     1:] == response_start[:-1]:
+                        # Mark end of reasoning and start response content
+                        # after the start of response sequence.
+                        end_reasoning_content = current_chunk_end - len(
+                            response_start)
+                        reasoning_content = current_text[
+                            start_reasoning_content:end_reasoning_content]
+                        response_content = current_text[current_chunk_end + 1:]
+                        return reasoning_content, len(
+                            response_start), response_content
+
+        if start_reasoning_content and not parsed_content:
+            return current_text[start_reasoning_content:], None, None
+        return None, None, None
diff --git a/vllm/model_executor/guided_decoding/reasoner/__init__.py b/vllm/model_executor/guided_decoding/reasoner/__init__.py
index d930d3dbe94c1..ab6e47c007d20 100644
--- a/vllm/model_executor/guided_decoding/reasoner/__init__.py
+++ b/vllm/model_executor/guided_decoding/reasoner/__init__.py
@@ -19,6 +19,10 @@ def get_reasoner(tokenizer: PreTrainedTokenizer,
         return None
     elif reasoning_backend == "deepseek_r1":
         return DeepSeekReasoner.from_tokenizer(tokenizer)
+    elif reasoning_backend == "granite":
+        logger.warning(
+            "Granite reasoner not yet implemented for structured outputs")
+        return None
     else:
         # Raise a warning for unknown reasoning backend and return
None # We cannot raise an error here because some reasoning models From e64afa455c034007c8ec53fa9c18547c721cf362 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 26 Mar 2025 23:54:24 +0800 Subject: [PATCH 015/593] multi-node offline DP+EP example (#15484) Signed-off-by: youkaichao --- examples/offline_inference/data_parallel.py | 120 ++++++++++++++++---- 1 file changed, 97 insertions(+), 23 deletions(-) diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index b73770ce382cf..232afd8b73d00 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -1,26 +1,49 @@ # SPDX-License-Identifier: Apache-2.0 -# usage: -# VLLM_USE_V1=1 python examples/offline_inference/data_parallel.py -# we need to have a launcher to create multiple data parallel -# ranks. And each rank will create a vLLM instance to process its own prompts. +""" +Usage: +Single node: + python examples/offline_inference/data_parallel.py \ + --model="ibm-research/PowerMoE-3b" \ + --dp-size=2 \ + --tp-size=2 + +Multi-node: + Node 0 (assume the node has ip of 10.99.48.128): + python examples/offline_inference/data_parallel.py \ + --model="ibm-research/PowerMoE-3b" \ + --dp-size=2 \ + --tp-size=2 \ + --node-size=2 \ + --node-rank=0 \ + --master-addr=10.99.48.128 \ + --master-port=13345 + Node 1: + python examples/offline_inference/data_parallel.py \ + --model="ibm-research/PowerMoE-3b" \ + --dp-size=2 \ + --tp-size=2 \ + --node-size=2 \ + --node-rank=1 \ + --master-addr=10.99.48.128 \ + --master-port=13345 +""" import os from vllm import LLM, SamplingParams from vllm.utils import get_open_port -GPUs_per_dp_rank = 2 -DP_size = 2 - -def main(dp_size, dp_rank, dp_master_ip, dp_master_port, GPUs_per_dp_rank): - os.environ["VLLM_DP_RANK"] = str(dp_rank) +def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, + dp_master_port, GPUs_per_dp_rank): + os.environ["VLLM_DP_RANK"] = str(global_dp_rank) 
os.environ["VLLM_DP_SIZE"] = str(dp_size) os.environ["VLLM_DP_MASTER_IP"] = dp_master_ip os.environ["VLLM_DP_MASTER_PORT"] = str(dp_master_port) # set devices for each dp_rank os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( - str(i) for i in range(dp_rank * GPUs_per_dp_rank, (dp_rank + 1) * - GPUs_per_dp_rank)) + str(i) + for i in range(local_dp_rank * GPUs_per_dp_rank, (local_dp_rank + 1) * + GPUs_per_dp_rank)) # Sample prompts. prompts = [ @@ -28,20 +51,20 @@ def main(dp_size, dp_rank, dp_master_ip, dp_master_port, GPUs_per_dp_rank): "The president of the United States is", "The capital of France is", "The future of AI is", - ] + ] * 100 # with DP, each rank should process different prompts. # usually all the DP ranks process a full dataset, # and each rank processes a different part of the dataset. promts_per_rank = len(prompts) // dp_size - start = dp_rank * promts_per_rank + start = global_dp_rank * promts_per_rank end = start + promts_per_rank prompts = prompts[start:end] if len(prompts) == 0: # if any rank has no prompts to process, # we need to set a placeholder prompt prompts = ["Placeholder"] - print(f"DP rank {dp_rank} needs to process {len(prompts)} prompts") + print(f"DP rank {global_dp_rank} needs to process {len(prompts)} prompts") # Create a sampling params object. # since we are doing data parallel, every rank can have different @@ -49,31 +72,82 @@ def main(dp_size, dp_rank, dp_master_ip, dp_master_port, GPUs_per_dp_rank): # ranks for demonstration. sampling_params = SamplingParams(temperature=0.8, top_p=0.95, - max_tokens=16 * (dp_rank + 1)) + max_tokens=[16, 20][global_dp_rank % 2]) # Create an LLM. - llm = LLM(model="ibm-research/PowerMoE-3b", + llm = LLM(model=model, tensor_parallel_size=GPUs_per_dp_rank, enforce_eager=True, enable_expert_parallel=True) outputs = llm.generate(prompts, sampling_params) # Print the outputs. 
- for output in outputs: + for i, output in enumerate(outputs): + if i >= 5: + # print only 5 outputs + break prompt = output.prompt generated_text = output.outputs[0].text - print(f"DP rank {dp_rank}, Prompt: {prompt!r}, " + print(f"DP rank {global_dp_rank}, Prompt: {prompt!r}, " f"Generated text: {generated_text!r}") if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser(description="Data Parallel Inference") + parser.add_argument("--model", + type=str, + default="ibm-research/PowerMoE-3b", + help="Model name or path") + parser.add_argument("--dp-size", + type=int, + default=2, + help="Data parallel size") + parser.add_argument("--tp-size", + type=int, + default=2, + help="Tensor parallel size") + parser.add_argument("--node-size", + type=int, + default=1, + help="Total number of nodes") + parser.add_argument("--node-rank", + type=int, + default=0, + help="Rank of the current node") + parser.add_argument("--master-addr", + type=str, + default="", + help="Master node IP address") + parser.add_argument("--master-port", + type=int, + default=0, + help="Master node port") + args = parser.parse_args() + + dp_size = args.dp_size + tp_size = args.tp_size + node_size = args.node_size + node_rank = args.node_rank + + if node_size == 1: + dp_master_ip = "127.0.0.1" + dp_master_port = get_open_port() + else: + dp_master_ip = args.master_addr + dp_master_port = args.master_port + + assert dp_size % node_size == 0, "dp_size should be divisible by node_size" + dp_per_node = dp_size // node_size + from multiprocessing import Process - dp_master_ip = "127.0.0.1" - dp_master_port = get_open_port() + procs = [] - for i in range(DP_size): + for local_dp_rank, global_dp_rank in enumerate( + range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)): proc = Process(target=main, - args=(DP_size, i, dp_master_ip, dp_master_port, - GPUs_per_dp_rank)) + args=(args.model, dp_size, local_dp_rank, + global_dp_rank, dp_master_ip, dp_master_port, + tp_size)) 
proc.start() procs.append(proc) exit_code = 0 From 0af4d764d6626251923aa61adcf16c9bce488454 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 26 Mar 2025 17:17:53 +0000 Subject: [PATCH 016/593] Fix weight loading for some models in Transformers backend (#15544) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index bdc390689104e..70daadf913798 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -345,9 +345,11 @@ class TransformersModel(nn.Module): params_dict = dict(self.named_parameters()) loaded_params = set[str]() for name, loaded_weight in weights: - # Necessary for some models which use remote code - if not name.startswith(prefix := self.model.base_model_prefix): - name = maybe_prefix(prefix, name) + # Use "model" instead of base_model_prefix because + # the base model attribute in vLLM is always `model` + if not name.startswith(prefix := "model."): + name = prefix + name + if is_pp_missing_parameter(name, self): continue if name in params_dict: From 733e7c9e95f5b066ac420b00701eef7ea164a79e Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Wed, 26 Mar 2025 13:51:56 -0400 Subject: [PATCH 017/593] [Refactor] Remove unnecessary backend parameter in structured output interface (#15317) Signed-off-by: Aaron Pham --- vllm/v1/structured_output/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 6c6a8a7bce3ec..218af43deb677 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -2,7 +2,7 @@ from __future__ import annotations import multiprocessing -from concurrent.futures import 
Future, ThreadPoolExecutor +from concurrent.futures import ThreadPoolExecutor from typing import TYPE_CHECKING, Optional from vllm.config import VllmConfig @@ -57,13 +57,13 @@ class StructuredOutputManager: raise ValueError( f"Unsupported structured output backend: {backend_name}") - grammar: Future[StructuredOutputGrammar] = self.executor.submit( - self._async_create_grammar, request, self.backend) + grammar = self.executor.submit(self._async_create_grammar, request) request.structured_output_request.grammar = grammar # type: ignore[assignment] def _async_create_grammar( - self, request: Request, - backend: StructuredOutputBackend) -> StructuredOutputGrammar: + self, + request: Request, + ) -> StructuredOutputGrammar: key = request.structured_output_request.structured_output_key # type: ignore[union-attr] # Note that the request was validated in the engine core client, From 35fad35a485eac9195c510731ba4a9d297dfd963 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 26 Mar 2025 10:56:47 -0700 Subject: [PATCH 018/593] [V1][Sampler] Faster top-k only implementation (#15478) Signed-off-by: Nick Hill --- tests/v1/sample/test_topk_topp_sampler.py | 37 ++++++++++++++++ vllm/v1/sample/ops/topk_topp_sampler.py | 53 ++++++++++++++++++++--- vllm/v1/sample/sampler.py | 6 +++ 3 files changed, 91 insertions(+), 5 deletions(-) create mode 100644 tests/v1/sample/test_topk_topp_sampler.py diff --git a/tests/v1/sample/test_topk_topp_sampler.py b/tests/v1/sample/test_topk_topp_sampler.py new file mode 100644 index 0000000000000..8a5076412cfae --- /dev/null +++ b/tests/v1/sample/test_topk_topp_sampler.py @@ -0,0 +1,37 @@ +# SPDX-License-Identifier: Apache-2.0 +import torch +from torch import Generator + +from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p + +DEVICE = "cuda" + +BATCH_SIZE = 1024 +VOCAB_SIZE = 128 * 1024 + + +def test_topk_impl_equivalance(): + + with torch.device(DEVICE): + generator = Generator(device=DEVICE).manual_seed(33) + + logits = 
torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator) + + # Random top-k values between 1 and 9. + k = torch.randint(1, 10, (BATCH_SIZE, ), generator=generator) + + # Set k=vocab_size for ~50% of requests in the batch (top-k disabled). + k.masked_fill_( + torch.randint(0, + 2, (BATCH_SIZE, ), + generator=generator, + dtype=bool), VOCAB_SIZE) + + # Top-k only implementation + result1 = apply_top_k_top_p(logits=logits.clone(), k=k, p=None) + + # Top-p + top-k + no_op_top_p = torch.tensor([1.0]) + result2 = apply_top_k_top_p(logits=logits.clone(), k=k, p=no_op_top_p) + + assert torch.allclose(result1, result2) diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index 1dea711874bfd..5dfcae08b170c 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -19,6 +19,12 @@ except ImportError: class TopKTopPSampler(nn.Module): + """ + Module that performs optional top-k and top-p filtering followed by + weighted random sampling of logits. + + Implementations may update the logits tensor in-place. + """ def __init__(self): super().__init__() @@ -84,7 +90,11 @@ class TopKTopPSampler(nn.Module): k: Optional[torch.Tensor], p: Optional[torch.Tensor], ) -> torch.Tensor: - """PyTorch-native implementation of top-k and top-p sampling.""" + """ + PyTorch-native implementation of top-k and top-p sampling. + + The logits tensor may be updated in-place. + """ logits = apply_top_k_top_p(logits, k, p) probs = logits.softmax(dim=-1, dtype=torch.float32) return random_sample(probs, generators) @@ -136,10 +146,18 @@ def apply_top_k_top_p( ) -> torch.Tensor: """Apply top-k and top-p masks to the logits. - This function sorts the logits tensor, which can be slow for large batches. + If a top-p is used, this function will sort the logits tensor, + which can be slow for large batches. + + The logits tensor may be updated in-place. 
""" - if k is None and p is None: - return logits + if p is None: + if k is None: + return logits + + # Avoid sorting vocab for top-k only case. + return apply_top_k_only(logits, k) + logits_sort, logits_idx = logits.sort(dim=-1, descending=False) if k is not None: @@ -153,7 +171,7 @@ def apply_top_k_top_p( if p is not None: # Apply top-p. probs_sort = logits_sort.softmax(dim=-1) - probs_sum = probs_sort.cumsum(dim=-1) + probs_sum = torch.cumsum(probs_sort, dim=-1, out=probs_sort) top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1) # at least one top_p_mask[:, -1] = False @@ -164,6 +182,31 @@ def apply_top_k_top_p( return logits +def apply_top_k_only( + logits: torch.Tensor, + k: torch.Tensor, +) -> torch.Tensor: + """ + Apply top-k mask to the logits. + + This implementation doesn't involve sorting the entire vocab. + + The logits tensor may be updated in-place. + """ + no_top_k_mask = k == logits.shape[1] + # Set non-top-k rows to 1 so that we can gather. + k = k.masked_fill(no_top_k_mask, 1) + max_top_k = k.max() + # topk.values tensor has shape [batch_size, max_top_k]. + # Convert top k to 0-based index in range [0, max_top_k). + k_index = k.sub_(1).unsqueeze(1) + top_k_mask = logits.topk(max_top_k, dim=1).values.gather(1, k_index) + # Handle non-topk rows. + top_k_mask.masked_fill_(no_top_k_mask.unsqueeze(1), -float("inf")) + logits.masked_fill_(logits < top_k_mask, -float("inf")) + return logits + + def random_sample( probs: torch.Tensor, generators: dict[int, torch.Generator], diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 397a049dc2543..004f98496b0d7 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -87,6 +87,12 @@ class Sampler(nn.Module): logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> torch.Tensor: + """Sample logits based on sampling metadata. + + The various logits processing functions called in this method + may update the logits tensor in-place. 
+ """ + assert not (sampling_metadata.all_greedy and sampling_metadata.all_random) if sampling_metadata.all_random: From 27df5199d99627e1eb101071c2155f888181bd64 Mon Sep 17 00:00:00 2001 From: marko <5467316+dr75@users.noreply.github.com> Date: Wed, 26 Mar 2025 19:11:28 +0100 Subject: [PATCH 019/593] Support SHA256 as hash function in prefix caching (#15297) Signed-off-by: Marko Rosenmueller <5467316+dr75@users.noreply.github.com> --- docs/source/design/v1/prefix_caching.md | 7 ++- tests/test_utils.py | 23 +++++++- tests/v1/core/test_kv_cache_utils.py | 42 +++++++++----- tests/v1/core/test_prefix_caching.py | 22 ++++++-- tests/v1/engine/test_engine_args.py | 20 +++++++ vllm/config.py | 9 +++ vllm/engine/arg_utils.py | 38 +++++++++++-- vllm/utils.py | 20 +++++++ vllm/v1/core/block_pool.py | 20 ++++--- vllm/v1/core/kv_cache_manager.py | 8 ++- vllm/v1/core/kv_cache_utils.py | 75 ++++++++++++++----------- vllm/v1/core/sched/scheduler.py | 1 + 12 files changed, 214 insertions(+), 71 deletions(-) diff --git a/docs/source/design/v1/prefix_caching.md b/docs/source/design/v1/prefix_caching.md index 3d14a76840d45..ec1f3cb8d64a8 100644 --- a/docs/source/design/v1/prefix_caching.md +++ b/docs/source/design/v1/prefix_caching.md @@ -15,12 +15,13 @@ Block 3: |<------------------ prefix -------------------->| |<--- block tokens - In the example above, the KV cache in the first block can be uniquely identified with the token “A gentle breeze stirred”. The third block can be uniquely identified with the tokens in the block “laughed in the distance”, along with the prefix tokens “A gentle breeze stirred the leaves as children”. Therefore, we can build the block hash of `hash(tuple[components])`, where components are: * Parent hash value: The hash value of the parent hash block. -* Block tokens: A tuple of tokens in this block. The reason to include the exact tokens is to reduce potential hash value collision. +* Block tokens: A tuple of tokens in this block. 
The reason to include the exact tokens is to reduce potential hash value collision. * Extra hashes: Other values required to make this block unique, such as LoRA IDs and multi-modality input hashes (see the example below). -Note 1: We only cache full blocks. +> **Note 1:** We only cache full blocks. -Note 2: The above hash key structure is not 100% collision free. Theoretically it’s still possible for the different prefix tokens to have the same hash value, but this should be nearly impossible to happen. Of course, contributions are welcome if you have an awesome idea to eliminate collusion entirely. +> **Note 2:** The above hash key structure is not 100% collision free. Theoretically it’s still possible for the different prefix tokens to have the same hash value. To avoid any hash collisions **in a multi-tenant setup, we advise to use SHA256** as hash function instead of the default builtin hash. +SHA256 is supported since vLLM v0.8.3 and must be enabled with a command line argument. It comes with a performance impact of about 100-200ns per token (~6ms for 50k tokens of context). **A hashing example with multi-modality inputs** In this example, we illustrate how prefix caching works with multi-modality inputs (e.g., images). 
Assuming we have a request with the following messages: diff --git a/tests/test_utils.py b/tests/test_utils.py index 3660cfa0e49e2..ccbbffcabfcda 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,6 +2,8 @@ # ruff: noqa import asyncio +import hashlib +import pickle import socket from collections.abc import AsyncIterator from unittest.mock import patch @@ -14,7 +16,8 @@ from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.utils import (FlexibleArgumentParser, MemorySnapshot, PlaceholderModule, StoreBoolean, bind_kv_cache, deprecate_kwargs, get_open_port, memory_profiling, - merge_async_iterators, supports_kw, swap_dict_values) + merge_async_iterators, sha256, supports_kw, + swap_dict_values) from .utils import create_new_process_for_each_test, error_on_warning @@ -476,3 +479,21 @@ def test_swap_dict_values(obj, key1, key2): assert obj[key1] == original_obj[key2] else: assert key1 not in obj + +@pytest.mark.parametrize("input", [(), ("abc", ), (None, ), + (None, bool, [1, 2, 3])]) +@pytest.mark.parametrize("output", [0, 1, 2]) +def test_sha256(input: tuple, output: int): + hash = sha256(input) + assert hash is not None + assert isinstance(hash, int) + assert hash != 0 + + bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL) + assert hash == int.from_bytes(hashlib.sha256(bytes).digest(), byteorder="big") + + # hashing again, returns the same value + assert hash == sha256(input) + + # hashing different input, returns different value + assert hash != sha256(input + (1, )) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 3fecb517c4369..8362af24a67ed 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -5,8 +5,12 @@ import torch from vllm.multimodal.inputs import MultiModalKwargs from vllm.sampling_params import SamplingParams -from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue, - KVCacheBlock, 
PrefixCachingMetrics, +from vllm.utils import sha256 +# disable yapf here as it formats differently than isort such that both fail +# yapf: disable +from vllm.v1.core.kv_cache_utils import (NONE_HASH, BlockHashType, + FreeKVCacheBlockQueue, KVCacheBlock, + PrefixCachingMetrics, generate_block_hash_extra_keys, hash_block_tokens, hash_request_tokens, @@ -16,6 +20,8 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request +# yapf: enable + def make_request(request_id, prompt_token_ids, @@ -40,6 +46,12 @@ def make_request(request_id, ) +def test_none_hash(): + assert NONE_HASH is not None + assert isinstance(NONE_HASH, int) + assert NONE_HASH != 0 + + def test_kv_cache_block(): # Test KVCacheBlock initialization block = KVCacheBlock(block_id=0) @@ -190,21 +202,23 @@ def test_generate_block_hash_extra_keys_no_mm_inputs(): assert next_mm_idx == 0 -def test_hash_block_tokens(): +@pytest.mark.parametrize("hash_fn", [sha256, hash]) +def test_hash_block_tokens(hash_fn): parent_block_hash = 123 curr_block_token_ids = (1, 2, 3) extra_keys = ("key1", "key2") - block_hash = hash_block_tokens(parent_block_hash, curr_block_token_ids, - extra_keys) + block_hash = hash_block_tokens(hash_fn, parent_block_hash, + curr_block_token_ids, extra_keys) assert isinstance(block_hash, BlockHashType) - assert block_hash.hash_value == hash( + assert block_hash.hash_value == hash_fn( (parent_block_hash, curr_block_token_ids, extra_keys)) assert block_hash.token_ids == curr_block_token_ids assert block_hash.extra_keys == extra_keys -def test_hash_request_tokens(): +@pytest.mark.parametrize("hash_fn", [sha256, hash]) +def test_hash_request_tokens(hash_fn): request = make_request( request_id=0, prompt_token_ids=[_ for _ in range(6)], @@ -219,7 +233,7 @@ def test_hash_request_tokens(): ) block_size = 3 - block_hashes = hash_request_tokens(block_size, request) + block_hashes = 
hash_request_tokens(hash_fn, block_size, request) assert len(block_hashes) == 2 assert isinstance(block_hashes[0], BlockHashType) @@ -234,7 +248,8 @@ def test_hash_request_tokens(): assert block_hashes[1].extra_keys == ("hash2", ) -def test_hash_tokens_different_mm_input(): +@pytest.mark.parametrize("hash_fn", [sha256, hash]) +def test_hash_tokens_different_mm_input(hash_fn): request1 = make_request( request_id=0, prompt_token_ids=[_ for _ in range(6)], @@ -260,13 +275,14 @@ def test_hash_tokens_different_mm_input(): mm_hashes=["hash3", "hash2"], ) block_size = 3 - block_hashes1 = hash_request_tokens(block_size, request1) - block_hashes2 = hash_request_tokens(block_size, request2) + block_hashes1 = hash_request_tokens(hash_fn, block_size, request1) + block_hashes2 = hash_request_tokens(hash_fn, block_size, request2) assert block_hashes1[0] != block_hashes2[0] assert block_hashes1[1] != block_hashes2[1] -def test_hash_request_tokens_no_mm_inputs(): +@pytest.mark.parametrize("hash_fn", [sha256, hash]) +def test_hash_request_tokens_no_mm_inputs(hash_fn): request = make_request( request_id=0, prompt_token_ids=[_ for _ in range(6)], @@ -275,7 +291,7 @@ def test_hash_request_tokens_no_mm_inputs(): ) block_size = 3 - block_hashes = hash_request_tokens(block_size, request) + block_hashes = hash_request_tokens(hash_fn, block_size, request) assert len(block_hashes) == 2 assert block_hashes[0].token_ids == (0, 1, 2) diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 6129752bcdd65..72a1874fbd446 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -7,7 +7,7 @@ import pytest from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.sampling_params import SamplingParams -from vllm.utils import cdiv +from vllm.utils import cdiv, sha256 from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.kv_cache_manager import KVCacheManager, Request from 
vllm.v1.core.kv_cache_utils import (BlockHashType, KVCacheBlock, @@ -39,16 +39,21 @@ def make_request(request_id, ) -def test_prefill(): +@pytest.mark.parametrize("hash_algo", ["sha256", "hash"]) +def test_prefill(hash_algo): manager = KVCacheManager( block_size=16, num_gpu_blocks=10, max_model_len=8192, sliding_window=None, enable_caching=True, + caching_hash_algo=hash_algo, num_preallocate_tokens=16, ) + # choose the hash function according to the parameter + hash_fn = sha256 if hash_algo == "sha256" else hash + # Complete 3 blocks (48 tokens) common_token_ids = [i for i in range(3) for _ in range(16)] @@ -68,7 +73,8 @@ def test_prefill(): parent_block_hash = None for block_id in (0, 1, 2): block_tokens = tuple(all_token_ids[block_id * 16:(block_id + 1) * 16]) - block_hash = hash_block_tokens(parent_block_hash, block_tokens) + block_hash = hash_block_tokens(hash_fn, parent_block_hash, + block_tokens) assert manager.block_pool.blocks[block_id].block_hash == block_hash assert manager.block_pool.blocks[block_id].ref_cnt == 1 parent_block_hash = block_hash.hash_value @@ -163,6 +169,8 @@ def test_prefill_plp(): enable_caching=True, num_preallocate_tokens=16, ) + # the default hash function is hash + hash_fn = hash # Complete 3 blocks (48 tokens) common_token_ids = [i for i in range(3) for _ in range(16)] @@ -185,7 +193,8 @@ def test_prefill_plp(): parent_block_hash = None for block_id in (0, 1, 2): block_tokens = tuple(all_token_ids[block_id * 16:(block_id + 1) * 16]) - block_hash = hash_block_tokens(parent_block_hash, block_tokens) + block_hash = hash_block_tokens(hash_fn, parent_block_hash, + block_tokens) assert manager.block_pool.blocks[block_id].block_hash == block_hash assert manager.block_pool.blocks[block_id].ref_cnt == 1 parent_block_hash = block_hash.hash_value @@ -522,7 +531,8 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int): assert len(blocks) == 1 + num_preallocated_blocks -def test_cache_blocks(): 
+@pytest.mark.parametrize("hash_fn", [sha256, hash]) +def test_cache_blocks(hash_fn): """ This is a unit test that tests the correctness of the _cache_full_blocks function of KVCacheManager. @@ -550,6 +560,7 @@ def test_cache_blocks(): num_cached_blocks=0, num_full_blocks=2, block_size=block_size, + hash_fn=hash_fn, ) assert len(block_pool.cached_block_hash_to_block) == 2 @@ -564,6 +575,7 @@ def test_cache_blocks(): num_cached_blocks=2, num_full_blocks=3, block_size=block_size, + hash_fn=hash_fn, ) assert len(block_pool.cached_block_hash_to_block) == 3 assert blocks[0].block_hash is not None diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py index 02470ca92f47f..8963b21c4eb11 100644 --- a/tests/v1/engine/test_engine_args.py +++ b/tests/v1/engine/test_engine_args.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +from argparse import ArgumentError + import pytest from vllm import envs @@ -32,6 +34,24 @@ def test_prefix_caching_from_cli(): vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config() assert vllm_config.cache_config.enable_prefix_caching + # default hash algorithm is "builtin" + assert vllm_config.cache_config.prefix_caching_hash_algo == "builtin" + + # set hash algorithm to sha256 + args = parser.parse_args(["--prefix-caching-hash-algo", "sha256"]) + vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config() + assert vllm_config.cache_config.prefix_caching_hash_algo == "sha256" + + # set hash algorithm to builtin + args = parser.parse_args(["--prefix-caching-hash-algo", "builtin"]) + vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config() + assert vllm_config.cache_config.prefix_caching_hash_algo == "builtin" + + # an invalid hash algorithm raises an error + parser.exit_on_error = False + with pytest.raises(ArgumentError): + args = parser.parse_args(["--prefix-caching-hash-algo", "invalid"]) + def test_defaults_with_usage_context(): engine_args = 
EngineArgs(model="facebook/opt-125m") diff --git a/vllm/config.py b/vllm/config.py index 6f2da6aa87136..94cecba1e1fcb 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1124,6 +1124,7 @@ class CacheConfig: num_gpu_blocks_override: Optional[int] = None, sliding_window: Optional[int] = None, enable_prefix_caching: bool = False, + prefix_caching_hash_algo: str = "builtin", cpu_offload_gb: float = 0, calculate_kv_scales: Optional[bool] = None, ) -> None: @@ -1135,6 +1136,7 @@ class CacheConfig: self.is_attention_free = is_attention_free self.sliding_window = sliding_window self.enable_prefix_caching = enable_prefix_caching + self.prefix_caching_hash_algo = prefix_caching_hash_algo self.cpu_offload_gb = cpu_offload_gb self.calculate_kv_scales = calculate_kv_scales self._verify_args() @@ -1185,6 +1187,13 @@ class CacheConfig: "Prefix caching is not supported with sliding window. " "Run with --disable-sliding-window to use prefix caching.") + if self.enable_prefix_caching and self.prefix_caching_hash_algo not in ( + "builtin", "sha256"): + raise ValueError( + "Unknown prefix caching hash algorithm: " + f"{self.prefix_caching_hash_algo}. Must be either " + "'builtin' or 'sha256'.") + def verify_with_parallel_config( self, parallel_config: "ParallelConfig", diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index be00689f2b55f..364555b345834 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -118,6 +118,7 @@ class EngineArgs: max_parallel_loading_workers: Optional[int] = None block_size: Optional[int] = None enable_prefix_caching: Optional[bool] = None + prefix_caching_hash_algo: str = "builtin" disable_sliding_window: bool = False disable_cascade_attn: bool = False use_v2_block_manager: bool = True @@ -475,6 +476,16 @@ class EngineArgs: help="Enables automatic prefix caching. 
" "Use ``--no-enable-prefix-caching`` to disable explicitly.", ) + parser.add_argument( + "--prefix-caching-hash-algo", + type=str, + choices=["builtin", "sha256"], + default=EngineArgs.prefix_caching_hash_algo, + help="Set the hash algorithm for prefix caching. " + "Options are 'builtin' (Python's built-in hash) or 'sha256' " + "(collision resistant but with certain overheads). Defaults " + "to 'builtin'.", + ) parser.add_argument('--disable-sliding-window', action='store_true', help='Disables sliding window, ' @@ -1329,6 +1340,7 @@ class EngineArgs: num_gpu_blocks_override=self.num_gpu_blocks_override, sliding_window=model_config.get_sliding_window(), enable_prefix_caching=self.enable_prefix_caching, + prefix_caching_hash_algo=self.prefix_caching_hash_algo, cpu_offload_gb=self.cpu_offload_gb, calculate_kv_scales=self.calculate_kv_scales, ) @@ -1737,12 +1749,22 @@ class EngineArgs: msg = "Chunked prefill is not supported for pooling models" raise ValueError(msg) - # Disable prefix caching for multimodal models for VLLM_V0. - if (model_config.is_multimodal_model and self.enable_prefix_caching): - logger.warning( - "--enable-prefix-caching is not supported for multimodal " - "models in V0 and has been disabled.") - self.enable_prefix_caching = False + # if using prefix caching, we must set a hash algo + if self.enable_prefix_caching: + # Disable prefix caching for multimodal models for VLLM_V0. + if model_config.is_multimodal_model: + logger.warning( + "--enable-prefix-caching is not supported for multimodal " + "models in V0 and has been disabled.") + self.enable_prefix_caching = False + + # VLLM_V0 only supports builtin hash algo for prefix caching. + if self.prefix_caching_hash_algo is None: + self.prefix_caching_hash_algo = "builtin" + elif self.prefix_caching_hash_algo == "sha256": + raise ValueError( + "sha256 is not supported for prefix caching in V0 engine. " + "Please use 'builtin'.") # Set max_num_seqs to 256 for VLLM_V0. 
if self.max_num_seqs is None: @@ -1758,6 +1780,10 @@ class EngineArgs: if self.enable_prefix_caching is None: self.enable_prefix_caching = True + # if using prefix caching, we must set a hash algo + if self.enable_prefix_caching and self.prefix_caching_hash_algo is None: + self.prefix_caching_hash_algo = "builtin" + # V1 should use the new scheduler by default. # Swap it only if this arg is set to the original V0 default if self.scheduler_cls == EngineArgs.scheduler_cls: diff --git a/vllm/utils.py b/vllm/utils.py index 9e14a628993f6..101342333e66b 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -10,6 +10,7 @@ import datetime import enum import gc import getpass +import hashlib import importlib import importlib.metadata import importlib.util @@ -17,6 +18,7 @@ import inspect import ipaddress import multiprocessing import os +import pickle import re import signal import socket @@ -2442,3 +2444,21 @@ def cprofile(save_file: Optional[str] = None, enabled: bool = True): return wrapper return decorator + + +def sha256(input) -> int: + """Hash any picklable Python object using SHA-256. + + The input is serialized using pickle before hashing, which allows + arbitrary Python objects to be used. Note that this function does + not use a hash seed—if you need one, prepend it explicitly to the input. + + Args: + input: Any picklable Python object. + + Returns: + An integer representing the SHA-256 hash of the serialized input. 
+ """ + input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL) + return int.from_bytes(hashlib.sha256(input_bytes).digest(), + byteorder="big") diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index 394b47fddf0c9..79b0c42d4f812 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from collections import defaultdict from collections.abc import Iterable -from typing import Optional +from typing import Callable, Optional from vllm.logger import init_logger from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue, @@ -15,10 +15,10 @@ logger = init_logger(__name__) class BlockPool: """BlockPool that manages KVCacheBlocks. - It provides methods to allocate, free and cache the kv cache blocks. The - free_block_queue stores the free blocks in eviction order to enable - allocation, free, and cache eviction. The cached_block_hash_to_block - maps between block hash and cached block to support finding cached blocks + It provides methods to allocate, free and cache the kv cache blocks. The + free_block_queue stores the free blocks in eviction order to enable + allocation, free, and cache eviction. The cached_block_hash_to_block + maps between block hash and cached block to support finding cached blocks by their block hash. Args: @@ -75,11 +75,12 @@ class BlockPool: num_cached_blocks: int, num_full_blocks: int, block_size: int, + hash_fn: Callable, ) -> None: """Cache a list of full blocks for prefix caching. This function takes a list of blocks that will have their block hash metadata to be updated and cached. Given a request, it computes the - block hashes for the blocks starting from `num_cached_blocks` to + block hashes for the blocks starting from `num_cached_blocks` to `num_full_blocks`, updating the metadata for each block and caching them in the `cached_block_hash_to_block`. 
@@ -87,12 +88,13 @@ class BlockPool: request: The request to cache the blocks. blocks: All blocks in the request. block_hashes: Block hashes of the blocks in the request. Note that - this list may be shorter than the blocks list. In this case the + this list may be shorter than the blocks list. In this case the missed block hash will be computed in this function. num_cached_blocks: The number of blocks that are already cached. - num_full_blocks: The number of blocks that are full and should + num_full_blocks: The number of blocks that are full and should be cached after this function. block_size: Number of tokens in each block. + hash_fn: The hash function to use for block hashes. """ if num_cached_blocks == num_full_blocks: return @@ -138,7 +140,7 @@ class BlockPool: request, start_token_idx, end_token_idx, -1) # Compute the hash of the current block. - block_hash = hash_block_tokens(prev_block_hash_value, + block_hash = hash_block_tokens(hash_fn, prev_block_hash_value, block_tokens, extra_keys) block_hashes.append(block_hash) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 5cfe2b96865a2..39390babaa8ef 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -5,7 +5,7 @@ from collections.abc import Iterable from typing import Optional from vllm.logger import init_logger -from vllm.utils import cdiv +from vllm.utils import cdiv, sha256 from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.kv_cache_utils import (BlockHashType, KVCacheBlock, hash_request_tokens) @@ -24,6 +24,7 @@ class KVCacheManager: max_model_len: int, sliding_window: Optional[int] = None, enable_caching: bool = True, + caching_hash_algo: str = "builtin", num_preallocate_tokens: int = 64, log_stats: bool = False, ) -> None: @@ -33,6 +34,7 @@ class KVCacheManager: self.max_num_blocks_per_req = cdiv(max_model_len, block_size) self.sliding_window = sliding_window self.enable_caching = enable_caching + self.caching_hash_fn 
= sha256 if caching_hash_algo == "sha256" else hash # FIXME: make prefix cache stats conditional on log_stats self.log_stats = log_stats # NOTE(woosuk): To avoid frequent block allocation, we preallocate some @@ -109,7 +111,8 @@ class KVCacheManager: # if the scheduler has tried to schedule the request before. block_hashes = self.req_to_block_hashes[request.request_id] if not block_hashes: - block_hashes = hash_request_tokens(self.block_size, request) + block_hashes = hash_request_tokens(self.caching_hash_fn, + self.block_size, request) self.req_to_block_hashes[request.request_id] = block_hashes self.prefix_cache_stats.requests += 1 @@ -247,6 +250,7 @@ class KVCacheManager: num_cached_blocks=num_cached_blocks, num_full_blocks=num_full_blocks_after_append, block_size=self.block_size, + hash_fn=self.caching_hash_fn, ) self.num_cached_block[ diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index e0d7f4dbdc1c1..0d58d4d2218f4 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -1,12 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 """KV-Cache Utilities.""" +import os from collections import deque from collections.abc import Sequence from dataclasses import dataclass -from typing import Any, NamedTuple, Optional +from typing import Any, Callable, NamedTuple, Optional from vllm.config import VllmConfig from vllm.logger import init_logger +from vllm.utils import sha256 from vllm.v1.kv_cache_interface import (KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, KVCacheTensor) from vllm.v1.metrics.stats import PrefixCacheStats @@ -18,9 +20,8 @@ logger = init_logger(__name__) class BlockHashType(NamedTuple): """Hash value of a block (int), the token IDs in the block, and extra keys. We keep a tuple of token IDs and extra keys to reduce the likelihood of - hash collisions when the hash value is the same. But please note that - hash collisions can still theoretically occur, albeit with an extremely - low probability. 
+    hash collisions when the hash value is the same. By using SHA256 however,
+    hash collisions are practically impossible.
    """
    # Hash value of the block in an integer.
    hash_value: int
@@ -30,6 +31,20 @@ class BlockHashType(NamedTuple):
     extra_keys: Optional[Any] = None
 
 
+# The hash seed for the first block of the prefix block sequence.
+#
+# Even if the hash function is the builtin hash(), we use sha256 to generate
+# the initial hash to simplify the code. This is not performance critical
+# as it is done once per process.
+#
+# We use a random value to avoid hash collisions or PYTHONHASHSEED environment
+# variable if set such that processes can share the seed if needed.
+# This aligns with the behavior of Python's hash() function, which also uses
+# a random seed if PYTHONHASHSEED is not set.
+NONE_HASH = int.from_bytes(os.urandom(32), byteorder="big") if os.getenv(
+    'PYTHONHASHSEED') is None else sha256(os.getenv('PYTHONHASHSEED'))
+
+
 class PrefixCachingMetrics:
     """Metrics for prefix caching with a hit rate of the most recent N requests.
@@ -148,7 +163,7 @@ class FreeKVCacheBlockQueue:
     builtin deque to support removing a block in the middle of the queue in
     O(1) time. To close the performance gap to the builtin deque which is
     implemented in C++, this class does not allocate any Python objects when
-    manipulating the linked list. Instead, this class manipulates the 
+    manipulating the linked list. Instead, this class manipulates the
     prev_free_block and next_free_block attributes of the given blocks.

     The queue is ordered by block ID in the beginning. When a block is allocated
@@ -178,7 +193,7 @@ class FreeKVCacheBlockQueue:

     def popleft(self) -> KVCacheBlock:
         """Pop the first free block and reduce num_free_blocks by 1.
-        
+
         Returns:
             The first free block.
         """
@@ -191,7 +206,7 @@ class FreeKVCacheBlockQueue:

     def remove(self, block: KVCacheBlock) -> None:
         """Remove a block in the free list and reduce num_free_blocks by 1.
-        
+
         Args:
             block: The block to remove.
""" @@ -235,7 +250,7 @@ class FreeKVCacheBlockQueue: def get_all_free_blocks(self) -> list[KVCacheBlock]: """Get all free blocks in the free list. Mainly used for testing. - + Returns: A list of free blocks. """ @@ -251,10 +266,10 @@ def need_extra_keys(request: Request) -> bool: """Check whether the blocks allocated to this request need extra hash keys. Args: - request (Request): The request. + request (Request): The request. Returns: - bool: Whether blocks allocated to this request need extra hash keys. + bool: Whether blocks allocated to this request need extra hash keys. """ # Multimodal requests need to include the MM hash. @@ -269,13 +284,13 @@ def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int, computation. For multi-modal inputs, the extra keys are (mm_hash, start_offset) that indicate a mm input contained in the block and its starting offset in the block tokens. - + Args: request: The request object. start_token_idx: The start token index of the block. end_token_idx: The end token index of the block. start_mm_idx: The start multi-modal index of the block. - + Returns: A tuple of extra keys and the next multi-modal index. """ @@ -333,10 +348,10 @@ def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int, def _gen_lora_extra_hash_keys(request: Request) -> list[int]: """Generate extra keys related to LoRA for block hash computation. - + Args: request: The request object. - + Returns: Return LoRA id of the request if it is a LoRA request. Return empty list otherwise. @@ -351,13 +366,13 @@ def generate_block_hash_extra_keys( start_mm_idx: int) -> tuple[Optional[tuple[Any, ...]], int]: """Generate extra keys for the block hash. The extra keys can come from the multi-modal inputs and request specific metadata (e.g., LoRA ID). - + Args: request: The request object. start_token_idx: The start token index of the block. end_token_idx: The end token index of the block. start_mm_idx: The start multi-modal index of the block. 
- + Returns: A tuple of extra keys and the next multi-modal index. """ @@ -375,6 +390,7 @@ def generate_block_hash_extra_keys( def hash_block_tokens( + hash_function: Callable, parent_block_hash: Optional[int], curr_block_token_ids: Sequence[int], extra_keys: Optional[tuple[Any, ...]] = None) -> BlockHashType: @@ -395,21 +411,16 @@ def hash_block_tokens( The entire tuple is used as the hash key of the block. """ if not parent_block_hash: - # Note that we use 'None' as a string here instead of None because - # as of Python 3.12, hash(None) returns a constant predictable value. - # This could possibly make it easier to find and exploit hash - # collisions. 'None' as a string will be hashed differently per process, - # but consistently within the same process. This is the same as the - # behavior of None prior to Python 3.12. - parent_block_hash = hash('None') + parent_block_hash = NONE_HASH curr_block_token_ids_tuple = tuple(curr_block_token_ids) return BlockHashType( - hash((parent_block_hash, curr_block_token_ids_tuple, extra_keys)), + hash_function( + (parent_block_hash, curr_block_token_ids_tuple, extra_keys)), curr_block_token_ids_tuple, extra_keys) -def hash_request_tokens(block_size: int, +def hash_request_tokens(hash_function: Any, block_size: int, request: Request) -> list[BlockHashType]: """Computes hash values of a chain of blocks given a sequence of token IDs. The hash value is used for prefix caching. 
@@ -441,7 +452,7 @@ def hash_request_tokens(block_size: int, req_extra_keys, curr_mm_idx = generate_block_hash_extra_keys( request, start, end, curr_mm_idx) - block_hash = hash_block_tokens(parent_block_hash_value, + block_hash = hash_block_tokens(hash_function, parent_block_hash_value, block_token_ids, req_extra_keys) ret.append(block_hash) parent_block_hash_value = block_hash.hash_value @@ -452,7 +463,7 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec], available_memory: int): """ - Checks whether `available_memory` is enough for the KV cache to hold at + Checks whether `available_memory` is enough for the KV cache to hold at least one request with the model's max_model_len. Args: @@ -489,15 +500,15 @@ def create_kv_cache_group_specs( grouped_layer_names: list[list[str]]) -> list[KVCacheGroupSpec]: """ Create KVCacheGroupSpec object for each kv cache group layer. - The layers in the same group should share the same + The layers in the same group should share the same KVCacheSpec. Args: kv_cache_spec: A mapping from each layer name to its corresponding KVCacheSpec. grouped_layer_names: - A list of kv cache groups, where each element is a list of layer - names that belong to the same group and should share the same + A list of kv cache groups, where each element is a list of layer + names that belong to the same group and should share the same KVCacheSpec. Returns: A list of KVCacheGroupSpec objects, one for each group. @@ -614,11 +625,11 @@ def get_kv_cache_config(vllm_config: VllmConfig, def unify_kv_cache_configs(kv_cache_configs: list[KVCacheConfig]): """ - Make the KV cache configurations for each worker consistent, so that all + Make the KV cache configurations for each worker consistent, so that all workers can be controlled by the same KVCacheManager. This function verifies that the layer group of each worker are the same, and changes the num_blocks of each worker to the smallest among all workers. 
- + Args: kv_cache_configs: The KV cache configurations for each worker. Will be in-place modified to make them consistent. diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 924796e03da7e..850687423df73 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -61,6 +61,7 @@ class Scheduler(SchedulerInterface): max_model_len=self.max_model_len, sliding_window=self.cache_config.sliding_window, enable_caching=self.cache_config.enable_prefix_caching, + caching_hash_algo=self.cache_config.prefix_caching_hash_algo, log_stats=self.log_stats) self.block_size = self.cache_config.block_size From dd8a29da99aaca4aaedf710c813222871245e140 Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Wed, 26 Mar 2025 15:35:11 -0500 Subject: [PATCH 020/593] Applying some fixes for K8s agents in CI (#15493) Signed-off-by: Alexei V. Ivanov --- .buildkite/run-amd-test.sh | 10 ++++++---- Dockerfile.rocm | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 0680bae13ddbf..e5a1b760db1f0 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -134,9 +134,10 @@ if [[ $commands == *"--shard-id="* ]]; then # assign shard-id for each shard commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "} echo "Shard ${GPU} commands:$commands_gpu" + echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" docker run \ - --device /dev/kfd --device /dev/dri \ - --network host \ + --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ + --network=host \ --shm-size=16gb \ --rm \ -e HIP_VISIBLE_DEVICES="${GPU}" \ @@ -163,9 +164,10 @@ if [[ $commands == *"--shard-id="* ]]; then fi done else + echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" docker run \ - --device /dev/kfd --device /dev/dri \ - --network host \ + --device /dev/kfd 
$BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ + --network=host \ --shm-size=16gb \ --rm \ -e HIP_VISIBLE_DEVICES=0 \ diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 841e7978a424f..f9ebb10ca8731 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -12,7 +12,8 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}} # Install some basic utilities RUN apt-get update -q -y && apt-get install -q -y \ - sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev + sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \ + apt-transport-https ca-certificates wget curl # Remove sccache RUN python3 -m pip install --upgrade pip && pip install setuptools_scm RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" From b2e85e26f408a8bb74b7657b6bcddfede1a93090 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com> Date: Wed, 26 Mar 2025 17:35:05 -0400 Subject: [PATCH 021/593] [V1] TPU - Revert to exponential padding by default (#15565) Signed-off-by: Alexander Matveev --- vllm/envs.py | 4 ++-- vllm/v1/worker/tpu_model_runner.py | 35 ++++++++++++++++++++++-------- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 4c413006a6413..46c5b3a1dc5d0 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -99,7 +99,7 @@ if TYPE_CHECKING: VLLM_MARLIN_USE_ATOMIC_ADD: bool = False VLLM_V0_USE_OUTLINES_CACHE: bool = False VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION: bool = False - VLLM_TPU_BUCKET_PADDING_GAP: int = 64 + VLLM_TPU_BUCKET_PADDING_GAP: int = 0 def get_default_cache_root(): @@ -648,7 +648,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # 8, we will run forward pass with [16, 24, 32, ...]. 
"VLLM_TPU_BUCKET_PADDING_GAP": lambda: int(os.environ["VLLM_TPU_BUCKET_PADDING_GAP"]) - if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ else 64, + if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ else 0, } # end-env-vars-definition diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index edf859f0b9463..cf5c56b98beaa 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -944,18 +944,35 @@ def _get_paddings(min_token_size: int, max_token_size: int, padding_gap: int) -> list[int]: """Generate a list of padding size, starting from min_token_size, ending with a number that can cover max_token_size - first increase the size to twice, - then increase the padding size by padding_gap. + + If padding_gap == 0 then: + increase 2X each time (exponential) + else: + first increase the size to twice, + then increase the padding size by padding_gap. """ paddings = [] num = min_token_size - while num <= padding_gap: - paddings.append(num) - num *= 2 - num //= 2 - while num < max_token_size: - num += padding_gap - paddings.append(num) + + if padding_gap == 0: + logger.info("Using exponential paddings:") + while num <= max_token_size: + logger.info(" %d", num) + paddings.append(num) + num *= 2 + + else: + logger.info("Using incremental paddings:") + while num <= padding_gap: + logger.info(" %d", num) + paddings.append(num) + num *= 2 + num //= 2 + while num < max_token_size: + num += padding_gap + logger.info(" %d", num) + paddings.append(num) + return paddings From 9d119a86ae9a1655a972afc7b1f701b7c7191876 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com> Date: Wed, 26 Mar 2025 17:51:54 -0400 Subject: [PATCH 022/593] [V1] TPU CI - Fix test_compilation.py (#15570) Signed-off-by: Alexander Matveev --- .buildkite/run-tpu-v1-test.sh | 2 +- tests/tpu/test_compilation.py | 57 +++++++++++------------------------ 2 files changed, 18 insertions(+), 41 deletions(-) diff --git 
a/.buildkite/run-tpu-v1-test.sh b/.buildkite/run-tpu-v1-test.sh index d557feefba7aa..6e1f79ae649e3 100755 --- a/.buildkite/run-tpu-v1-test.sh +++ b/.buildkite/run-tpu-v1-test.sh @@ -22,7 +22,7 @@ docker run --privileged --net host --shm-size=16G -it \ && export VLLM_USE_V1=1 \ && export VLLM_XLA_CHECK_RECOMPILATION=1 \ && echo TEST_1 \ - && pytest /workspace/vllm/tests/tpu/test_compilation.py \ + && pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \ && echo TEST_2 \ && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \ && echo TEST_3 \ diff --git a/tests/tpu/test_compilation.py b/tests/tpu/test_compilation.py index e70b3e17c6f93..27328d4542d9a 100644 --- a/tests/tpu/test_compilation.py +++ b/tests/tpu/test_compilation.py @@ -5,12 +5,8 @@ import os import tempfile import depyf -import pytest - -from vllm.config import CompilationLevel -@pytest.mark.skip(reason="Not working; needs investigation.") def test_tpu_compilation(): temp_dir = tempfile.mkdtemp() with depyf.prepare_debug(temp_dir): @@ -22,27 +18,24 @@ def test_tpu_compilation(): "The greatest glory in living lies not in never falling,", ] answers = [ - " or, through inaction, allow a human being to come to harm.", - " what is essential is invisible to the eye.", - " but in rising every time we fall.", + " or, through inaction", + " what is essential ", + " but in rising ", ] - N = 1 + # Currently, top-p sampling is disabled. `top_p` should be 1.0. + N = 1 sampling_params = SamplingParams(temperature=0.7, top_p=1.0, n=N, max_tokens=16) - # Set `enforce_eager=True` to avoid ahead-of-time compilation. - # In real workloads, `enforace_eager` should be `False`. 
- - # disable custom dispatcher, let Dynamo takes over - # all the control llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct", - max_model_len=512, - max_num_seqs=64, - enforce_eager=True, - compilation_config={"level": CompilationLevel.DYNAMO_AS_IS}) + max_num_batched_tokens=256, + max_model_len=256, + max_num_seqs=32, + enforce_eager=False) + outputs = llm.generate(prompts, sampling_params) for output, answer in zip(outputs, answers): prompt = output.prompt @@ -56,16 +49,11 @@ def test_tpu_compilation(): for i, compiled_code in enumerate(compiled_codes): print("{} file: {}".format(i + 1, compiled_code)) - # We should only trigger Dynamo compilation 4 times: - # 1. forward pass (symbolic) - # 2. compute_logits (symbolic) - # 3. forward pass (shape 16) - # 4. forward pass (shape 32) - # and later calls should not trigger Dynamo compilation again. - # NOTE: It might still trigger XLA compilation. - + # We should only trigger Dynamo compilation 2 times: + # 1. Forward pass without kv_caches + # 2. 
Forward pass with kv_caches # Check we have 4 compiled codes - assert len(compiled_codes) == 4 + assert len(compiled_codes) == 2 kv_cache_prefix = "kv_cache" attn_prefix = "ragged_paged_attention" @@ -77,24 +65,13 @@ def test_tpu_compilation(): for i, compiled_fn in enumerate(compiled_fns): print("{} file: {}".format(i + 1, compiled_fn)) - # The first compilation is symbolic, so it should not have any kv_caches + # The first compilation should not have any kv_caches with open(compiled_fns[0]) as f: content = f.read() assert kv_cache_prefix not in content - # The second compilation is symbolic, so it should not have any kv_caches + # The second compilation should have kv_caches and the + # ragged_paged_attention with open(compiled_fns[1]) as f: - content = f.read() - assert kv_cache_prefix not in content - - # The third compilation is shape 16, so it should have kv_caches and the - # ragged_paged_attention - with open(compiled_fns[2]) as f: - content = f.read() - assert (kv_cache_prefix in content and attn_prefix in content) - - # The forth compilation is shape 32, so it should have kv_caches and the - # ragged_paged_attention - with open(compiled_fns[3]) as f: content = f.read() assert (kv_cache_prefix in content and attn_prefix in content) From 7a888271f5bd401f8fc64704c239833244471a91 Mon Sep 17 00:00:00 2001 From: Wes Date: Wed, 26 Mar 2025 17:21:34 -0600 Subject: [PATCH 023/593] Use Cache Hinting for fused_moe kernel (#15511) --- .../model_executor/layers/fused_moe/fused_moe.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 97e915c60335a..faaea6b4de972 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -189,7 +189,11 @@ def fused_moe_kernel_gptq_awq( mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K), other=0.0) - b = tl.load(b_ptrs) 
+ b = tl.load( + b_ptrs, + cache_modifier=".cg", + eviction_policy="evict_last", + ) if use_int4_w4a16: b = (b >> b_shifter) & 0xF @@ -391,9 +395,13 @@ def fused_moe_kernel( mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K), other=0.0) - b = tl.load(b_ptrs, - mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, - other=0.0) + b = tl.load( + b_ptrs, + mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, + other=0.0, + cache_modifier=".cg", + eviction_policy="evict_last", + ) # We accumulate along the K dimension. if use_int8_w8a16: accumulator = tl.dot(a, b.to(compute_type), acc=accumulator) From e74ff409e0f8f3cacb8a251a1cae8b478721cead Mon Sep 17 00:00:00 2001 From: Chengji Yao Date: Wed, 26 Mar 2025 17:09:28 -0700 Subject: [PATCH 024/593] [TPU] support disabling xla compilation cache (#15567) Signed-off-by: Chengji Yao --- vllm/v1/worker/tpu_worker.py | 13 ++++++++++--- vllm/worker/tpu_worker.py | 13 ++++++++++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 9a380373d4617..4d9a113e39ee4 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -113,9 +113,16 @@ class TPUWorker: # can have slightly different XLA graphs. world_size = self.parallel_config.world_size rank = xr.global_ordinal() - per_rank_path = os.path.join(envs.VLLM_XLA_CACHE_PATH, - f"tp{world_size}_rank{rank}") - xr.initialize_cache(per_rank_path, readonly=False) + # The PyTorch/XLA compilation cache uses the Torch IR to generate keys. + # Consequently, changes in optimization flags, which affect compilation + # results, don't change the cache key. This can result in the wrong + # compilation being used. 
To prevent this, disabling the XLA compilation + cache during development is recommended. We can disable it by + `export VLLM_XLA_CACHE_PATH=` + if envs.VLLM_XLA_CACHE_PATH: + per_rank_path = os.path.join(envs.VLLM_XLA_CACHE_PATH, + f"tp{world_size}_rank{rank}") + xr.initialize_cache(per_rank_path, readonly=False) # Init ModelRunner here, so that we have access to self.device. self.model_runner = TPUModelRunner(self.vllm_config, self.device) diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 66911790662eb..71b4b38fb9d62 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -93,9 +93,16 @@ class TPUWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase): # can have slightly different XLA graphs. world_size = self.parallel_config.world_size rank = xr.global_ordinal() - per_rank_path = os.path.join(envs.VLLM_XLA_CACHE_PATH, - f"tp{world_size}_rank{rank}") - xr.initialize_cache(per_rank_path, readonly=False) + # The PyTorch/XLA compilation cache uses the Torch IR to generate keys. + # Consequently, changes in optimization flags, which affect compilation + # results, don't change the cache key. This can result in the wrong + # compilation being used. 
To prevent this, disabling the XLA compilation + cache during development is recommended. We can disable it by + `export VLLM_XLA_CACHE_PATH=` + if envs.VLLM_XLA_CACHE_PATH: + per_rank_path = os.path.join(envs.VLLM_XLA_CACHE_PATH, + f"tp{world_size}_rank{rank}") + xr.initialize_cache(per_rank_path, readonly=False) self.profiler = None if envs.VLLM_TORCH_PROFILER_DIR and self.rank < 1: From 7a6d45bc8a201623c646627becd837afd6b35bc7 Mon Sep 17 00:00:00 2001 From: Matthew Vine <32849887+MattTheCuber@users.noreply.github.com> Date: Wed, 26 Mar 2025 20:19:46 -0400 Subject: [PATCH 025/593] Support FIPS enabled machines with MD5 hashing (#15299) Signed-off-by: Matthew Vine <32849887+MattTheCuber@users.noreply.github.com> --- tests/compile/piecewise/test_toy_llama.py | 3 +- vllm/compilation/backends.py | 7 ++-- vllm/compilation/compiler_interface.py | 3 +- vllm/config.py | 42 +++++++++++++++-------- 4 files changed, 36 insertions(+), 19 deletions(-) diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 7307f44b6184e..d4551b1cc3aec 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -63,7 +63,8 @@ class LlamaConfig: factors.append((k, v)) factors.sort() import hashlib - return hashlib.md5(str(factors).encode()).hexdigest() + return hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() def __post_init__(self): assert self.mlp_size >= self.hidden_size diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index d8c0c59ba9b22..45988c2e9b0d4 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -381,8 +381,8 @@ class VllmBackend: with open(filepath) as f: hash_content.append(f.read()) import hashlib - code_hash = hashlib.md5( - "\n".join(hash_content).encode()).hexdigest() + code_hash = hashlib.md5("\n".join(hash_content).encode(), + usedforsecurity=False).hexdigest() factors.append(code_hash) # 3. 
compiler hash @@ -390,7 +390,8 @@ class VllmBackend: factors.append(compiler_hash) # combine all factors to generate the cache dir - hash_key = hashlib.md5(str(factors).encode()).hexdigest()[:10] + hash_key = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest()[:10] cache_dir = os.path.join( envs.VLLM_CACHE_ROOT, diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index b45c694fd7f89..571e2b832e95f 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -139,7 +139,8 @@ class InductorAdaptor(CompilerInterface): from torch._inductor.codecache import torch_key torch_factors = torch_key() factors.append(torch_factors) - hash_str = hashlib.md5(str(factors).encode()).hexdigest()[:10] + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest()[:10] return hash_str def initialize_cache(self, cache_dir: str, disable_cache: bool = False): diff --git a/vllm/config.py b/vllm/config.py index 94cecba1e1fcb..2e9325c258b26 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1111,7 +1111,8 @@ class CacheConfig: factors: list[Any] = [] factors.append(self.cache_dtype) # `cpu_offload_gb` does not use `torch.compile` yet. - hash_str = hashlib.md5(str(factors).encode()).hexdigest() + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() return hash_str def __init__( @@ -1243,7 +1244,8 @@ class TokenizerPoolConfig: # no factors to consider. # this config will not affect the computation graph. factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode()).hexdigest() + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() return hash_str def __post_init__(self): @@ -1354,7 +1356,8 @@ class LoadConfig: # no factors to consider. # this config will not affect the computation graph. 
factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode()).hexdigest() + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() return hash_str def __post_init__(self): @@ -1674,7 +1677,8 @@ class SchedulerConfig: # no factors to consider. # this config will not affect the computation graph. factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode()).hexdigest() + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() return hash_str def __post_init__(self) -> None: @@ -1810,7 +1814,8 @@ class DeviceConfig: # the device/platform information will be summarized # by torch/vllm automatically. factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode()).hexdigest() + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() return hash_str def __init__(self, device: str = "auto") -> None: @@ -1983,7 +1988,8 @@ class SpeculativeConfig: # no factors to consider. # spec decode does not use `torch.compile` yet. factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode()).hexdigest() + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() return hash_str @classmethod @@ -2358,7 +2364,8 @@ class LoRAConfig: factors.append(self.lora_extra_vocab_size) factors.append(self.long_lora_scaling_factors) factors.append(self.bias_enabled) - hash_str = hashlib.md5(str(factors).encode()).hexdigest() + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() return hash_str def __post_init__(self): @@ -2424,7 +2431,8 @@ class PromptAdapterConfig: # no factors to consider. # this config will not affect the computation graph. factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode()).hexdigest() + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() return hash_str def __post_init__(self): @@ -2469,7 +2477,8 @@ class MultiModalConfig: # no factors to consider. 
# this config will not affect the computation graph. factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode()).hexdigest() + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() return hash_str def get_limit_per_prompt(self, modality: str) -> int: @@ -2535,7 +2544,8 @@ class PoolerConfig: # no factors to consider. # this config will not affect the computation graph. factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode()).hexdigest() + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() return hash_str @staticmethod @@ -2816,7 +2826,8 @@ class DecodingConfig: # no factors to consider. # this config will not affect the computation graph. factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode()).hexdigest() + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() return hash_str def __post_init__(self): @@ -2866,7 +2877,8 @@ class ObservabilityConfig: # no factors to consider. # this config will not affect the computation graph. factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode()).hexdigest() + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() return hash_str def __post_init__(self): @@ -2928,7 +2940,8 @@ class KVTransferConfig(BaseModel): # no factors to consider. # this config will not affect the computation graph. 
factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode()).hexdigest() + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() return hash_str @classmethod @@ -3425,7 +3438,8 @@ class VllmConfig: vllm_factors.append("None") factors.append(vllm_factors) - hash_str = hashlib.md5(str(factors).encode()).hexdigest()[:10] + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest()[:10] return hash_str def pad_for_cudagraph(self, batch_size: int) -> int: From 9239bf718e5ebb5ab871ac8ed09fb80ed02fa82b Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Thu, 27 Mar 2025 01:54:44 +0100 Subject: [PATCH 026/593] [Kernel] CUTLASS grouped gemm fp8 MoE kernel (#13972) Signed-off-by: ElizaWszola Signed-off-by: ElizaWszola Co-authored-by: Lucas Wilkinson --- CMakeLists.txt | 27 ++ .../kernels/benchmark_grouped_gemm_cutlass.py | 340 +++++++++++++ benchmarks/kernels/benchmark_shapes.py | 16 + csrc/cutlass_extensions/common.hpp | 12 +- .../broadcast_load_epilogue_array_c3x.hpp | 457 ++++++++++++++++++ .../epilogue/scaled_mm_epilogues_c3x.hpp | 66 +++ csrc/ops.h | 14 + .../cutlass_w8a8/moe/get_group_starts.cuh | 80 +++ .../cutlass_w8a8/moe/grouped_mm_c3x.cu | 160 ++++++ .../cutlass_w8a8/moe/grouped_mm_c3x.cuh | 149 ++++++ .../quantization/cutlass_w8a8/moe/moe_data.cu | 90 ++++ .../cutlass_w8a8/scaled_mm_entry.cu | 67 +++ csrc/torch_bindings.cpp | 29 ++ tests/kernels/test_cutlass.py | 134 +++++ tests/kernels/test_cutlass_moe.py | 244 ++++++++++ vllm/_custom_ops.py | 53 ++ .../layers/fused_moe/__init__.py | 5 +- .../layers/fused_moe/fused_moe.py | 137 ++++++ .../compressed_tensors/compressed_tensors.py | 31 +- .../compressed_tensors_moe.py | 202 +++++++- .../layers/quantization/utils/w8a8_utils.py | 10 + vllm/utils.py | 9 +- 22 files changed, 2317 insertions(+), 15 deletions(-) create mode 100644 benchmarks/kernels/benchmark_grouped_gemm_cutlass.py create mode 100644 
csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp create mode 100644 csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh create mode 100644 csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu create mode 100644 csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh create mode 100644 csrc/quantization/cutlass_w8a8/moe/moe_data.cu create mode 100644 tests/kernels/test_cutlass_moe.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 65d1ddbeee0b2..e0f1fdf78d142 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -461,6 +461,33 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") set(FP4_ARCHS) endif() + # + # CUTLASS MoE kernels + + # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works + # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible + # to compile MoE kernels that use its output. + cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) + set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu" + "csrc/quantization/cutlass_w8a8/moe/moe_data.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1") + message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) + message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is " + "not >= 12.3, we recommend upgrading to CUDA 12.3 or later " + "if you intend on running FP8 quantized MoE models on Hopper.") + else() + message(STATUS "Not building grouped_mm_c3x as no compatible archs found " + "in CUDA target architectures") + endif() + endif() + # # Machete kernels diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py new file mode 100644 
index 0000000000000..bcdbf6c7551a3 --- /dev/null +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -0,0 +1,340 @@ +# SPDX-License-Identifier: Apache-2.0 + +import torch +import torch.utils.benchmark as benchmark +from benchmark_shapes import WEIGHT_SHAPES_MOE + +from vllm import _custom_ops as ops +from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.fused_moe import (cutlass_moe_fp8, + fused_experts, + fused_topk) +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = [ + "nm-testing/Mixtral-8x7B-Instruct-v0.1", "nm-testing/deepseekv2-lite", + "ibm-granite/granite-3.0-1b-a400m", "ibm-granite/granite-3.0-3b-a800m" +] +DEFAULT_BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512] +DEFAULT_TP_SIZES = [1] + +PER_ACT_TOKEN_OPTS = [False] +PER_OUT_CH_OPTS = [False] + + +def to_fp8(tensor: torch.Tensor): + finfo = torch.finfo(torch.float8_e4m3fn) + return torch.round(tensor.clamp( + min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) + + +def bench_run(results: list[benchmark.Measurement], model: str, + num_experts: int, topk: int, per_act_token: bool, + per_out_ch: bool, mkn: tuple[int, int, int]): + label = "Quant Matmul" + + sub_label = ( + "{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, " + "MKN=({})".format(model, num_experts, topk, per_act_token, per_out_ch, + mkn)) + + print(f"Testing: {sub_label}") + + (m, k, n) = mkn + + dtype = torch.half + + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((num_experts, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((num_experts, k, n), device="cuda", dtype=dtype) / 10 + + _, a_scale = ops.scaled_fp8_quant(a) + + w1_q = torch.empty((num_experts, 2 * n, k), + device="cuda", + dtype=torch.float8_e4m3fn) + w2_q = torch.empty((num_experts, k, n), + device="cuda", + dtype=torch.float8_e4m3fn) + w1_scale = torch.empty((num_experts, 1, 1), + device="cuda", + dtype=torch.float32) + 
w2_scale = torch.empty((num_experts, 1, 1), + device="cuda", + dtype=torch.float32) + + ab_strides1 = torch.full((num_experts, ), + k, + device="cuda", + dtype=torch.int64) + c_strides1 = torch.full((num_experts, ), + 2 * n, + device="cuda", + dtype=torch.int64) + ab_strides2 = torch.full((num_experts, ), + n, + device="cuda", + dtype=torch.int64) + c_strides2 = torch.full((num_experts, ), + k, + device="cuda", + dtype=torch.int64) + + for expert in range(num_experts): + w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert]) + w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert]) + w1_q_notransp = w1_q.clone() + w2_q_notransp = w2_q.clone() + w1_q = w1_q.transpose(1, 2) + w2_q = w2_q.transpose(1, 2) + + score = torch.randn((m, num_experts), device="cuda", dtype=dtype) + + topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False) + + def run_triton_moe(a: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, + w1_scale: torch.Tensor, w2_scale: torch.Tensor, + a_scale: torch.Tensor, num_repeats: int): + for _ in range(num_repeats): + fused_experts(a, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale) + + def run_cutlass_moe(a: torch.Tensor, a_scale: torch.Tensor, + w1: torch.Tensor, w2: torch.Tensor, + w1_scale: torch.Tensor, w2_scale: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, + ab_strides1: torch.Tensor, c_strides1: torch.Tensor, + ab_strides2: torch.Tensor, c_strides2: torch.Tensor, + num_repeats: int): + for _ in range(num_repeats): + cutlass_moe_fp8(a, + w1, + w2, + w1_scale, + w2_scale, + topk_weights, + topk_ids, + ab_strides1, + c_strides1, + ab_strides2, + c_strides2, + a1_scale=a_scale) + + def run_cutlass_from_graph( + a: torch.Tensor, a_scale: torch.Tensor, w1_q: torch.Tensor, + w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + topk_weights: torch.Tensor, 
topk_ids: torch.Tensor, + ab_strides1: torch.Tensor, c_strides1: torch.Tensor, + ab_strides2: torch.Tensor, c_strides2: torch.Tensor): + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + return cutlass_moe_fp8(a, + w1_q, + w2_q, + w1_scale, + w2_scale, + topk_weights, + topk_ids, + ab_strides1, + c_strides1, + ab_strides2, + c_strides2, + a1_scale=a_scale) + + def run_triton_from_graph(a: torch.Tensor, w1: torch.Tensor, + w2: torch.Tensor, topk_weights: torch.Tensor, + topk_ids: torch.Tensor, w1_scale: torch.Tensor, + w2_scale: torch.Tensor, a_scale: torch.Tensor): + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + return fused_experts(a, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale) + + def replay_graph(graph, num_repeats): + for _ in range(num_repeats): + graph.replay() + torch.cuda.synchronize() + + cutlass_stream = torch.cuda.Stream() + cutlass_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(cutlass_graph, stream=cutlass_stream): + run_cutlass_from_graph(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, + topk_weights, topk_ids, ab_strides1, c_strides1, + ab_strides2, c_strides2) + torch.cuda.synchronize() + + triton_stream = torch.cuda.Stream() + triton_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(triton_graph, stream=triton_stream): + run_triton_from_graph(a, w1_q_notransp, w2_q_notransp, topk_weights, + topk_ids, w1_scale, w2_scale, a_scale) + torch.cuda.synchronize() + + min_run_time = 5 + num_warmup = 5 + num_runs = 25 + + globals = { + # Baseline params + "w1": w1, + "w2": w2, + "score": score, + "topk": topk, + "w1_q_notransp": w1_q_notransp, + "w2_q_notransp": w2_q_notransp, + # Cutlass params + "a_scale": a_scale, + "w1_q": w1_q, + "w2_q": w2_q, + "w1_scale": w1_scale, + "w2_scale": w2_scale, + "ab_strides1": ab_strides1, + "c_strides1": c_strides1, + 
"ab_strides2": ab_strides2, + "c_strides2": c_strides2, + # cuda graph params + "cutlass_graph": cutlass_graph, + "triton_graph": triton_graph, + # Gen params + "a": a, + "topk_weights": topk_weights, + "topk_ids": topk_ids, + "num_runs": num_runs, + # Kernels + "run_triton_moe": run_triton_moe, + "run_cutlass_moe": run_cutlass_moe, + "replay_graph": replay_graph, + } + + # Warmup + run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, + w1_scale, w2_scale, a_scale, num_warmup) + + results.append( + benchmark.Timer( + stmt= + "run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="triton_moe", + ).blocked_autorange(min_run_time=min_run_time)) + + # Warmup + replay_graph(triton_graph, num_warmup) + + results.append( + benchmark.Timer( + stmt="replay_graph(triton_graph, num_runs)", + globals=globals, + label=label, + sub_label=sub_label, + description="triton_moe_cuda_graphs", + ).blocked_autorange(min_run_time=min_run_time)) + + # Warmup + run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, + topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, + num_warmup) + + results.append( + benchmark.Timer( + stmt= + "run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="grouped_gemm_moe", + ).blocked_autorange(min_run_time=min_run_time)) + + # Warmup + replay_graph(cutlass_graph, num_warmup) + + results.append( + benchmark.Timer( + stmt="replay_graph(cutlass_graph, num_runs)", + globals=globals, + label=label, + sub_label=sub_label, + description="grouped_gemm_moe_cuda_graphs", + ).blocked_autorange(min_run_time=min_run_time)) + + +def main(args): + print("Benchmarking models:") + for i, model in 
enumerate(args.models): + print(f"[{i}] {model}") + + results: list[benchmark.Measurement] = [] + + for model in args.models: + for tp in args.tp_sizes: + for layer in WEIGHT_SHAPES_MOE[model]: + num_experts = layer[0] + topk = layer[1] + size_k = layer[2] + size_n = layer[3] // tp + + if len(args.limit_k) > 0 and size_k not in args.limit_k: + continue + + if len(args.limit_n) > 0 and size_n not in args.limit_n: + continue + + for per_act_token in PER_ACT_TOKEN_OPTS: + for per_out_ch in PER_OUT_CH_OPTS: + for size_m in DEFAULT_BATCH_SIZES: + mkn = (size_m, size_k, size_n) + bench_run(results, model, num_experts, topk, + per_act_token, per_out_ch, mkn) + + compare = benchmark.Compare(results) + compare.print() + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark grouped GEMM MoE across specified models/shapes/batches") + parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES_MOE.keys(), + ) + parser.add_argument("--tp-sizes", + nargs="+", + type=int, + default=DEFAULT_TP_SIZES) + parser.add_argument("--batch-sizes", + nargs="+", + type=int, + default=DEFAULT_BATCH_SIZES) + parser.add_argument("--limit-k", nargs="+", type=int, default=[]) + parser.add_argument("--limit-n", nargs="+", type=int, default=[]) + parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[]) + parser.add_argument("--limit-per-act-token", + nargs="+", + type=int, + default=[]) + parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[]) + + args = parser.parse_args() + main(args) diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py index c375e61e41873..70190ba24d9df 100644 --- a/benchmarks/kernels/benchmark_shapes.py +++ b/benchmarks/kernels/benchmark_shapes.py @@ -75,3 +75,19 @@ WEIGHT_SHAPES = { [7168, 8192], ], } + +WEIGHT_SHAPES_MOE = { + "nm-testing/Mixtral-8x7B-Instruct-v0.1": [ + [8, 2, 4096, 28672], + [8, 2, 14336, 4096], + ], + 
"nm-testing/deepseekv2-lite": [ + [64, 6, 2048, 1408], + ], + "ibm-granite/granite-3.0-1b-a400m": [ + [32, 8, 1024, 1024], + ], + "ibm-granite/granite-3.0-3b-a800m": [ + [40, 8, 1024, 1536], + ], +} diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp index febc4eccd9561..dbe0e30f5cbfe 100644 --- a/csrc/cutlass_extensions/common.hpp +++ b/csrc/cutlass_extensions/common.hpp @@ -48,4 +48,14 @@ struct enable_sm90_or_later : Kernel { Kernel::operator()(std::forward(args)...); #endif } -}; \ No newline at end of file +}; + +template +struct enable_sm90_only : Kernel { + template + CUTLASS_DEVICE void operator()(Args&&... args) { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 900 + Kernel::operator()(std::forward(args)...); +#endif + } +}; diff --git a/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp b/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp new file mode 100644 index 0000000000000..5c1d6e3f46be0 --- /dev/null +++ b/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp @@ -0,0 +1,457 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// +// This file is a modified excerpt of +// include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp +// from https://github.com/NVIDIA/cutlass v3.5.0 +// It has been modified to support either row/column or scalar broadcasting +// where the tensor being loaded from is always passed in via a device pointer. +// This lets one compiled kernel handle all cases of per-tensor or +// per-channel/per-token quantization. +// +// This interface also allows the scales to be passed in as tensors that +// consistently reside on the device, which avoids an issue with a previous +// implementation where scalars needed to be on the CPU since they +// were passed in via float values. This created a potential performance hazard +// if scales were initially on the device, and caused torch.compile graphs +// breaks when moving scales to the CPU. 
+// +#pragma once + +// Turn off clang-format for the entire file to keep it close to upstream +// clang-format off + +#include "cutlass/cutlass.h" +#include "cutlass/arch/barrier.h" + +#include "cute/tensor.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp" + +namespace cutlass::epilogue::fusion { + +using namespace cute; +using namespace detail; + +// Row vector broadcast +template< + int Stages, + class CtaTileShapeMNK, + class Element, + class StrideMNL = Stride<_0,_1,_0>, + int Alignment = 128 / sizeof_bits_v +> +struct Sm90RowOrScalarBroadcastArray { + static_assert(Stages == 0, "Row broadcast doesn't support smem usage"); + static_assert(is_static_v(StrideMNL{}))>); // batch stride can be dynamic or static + static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_1>{}); + + struct SharedStorage { + array_aligned(CtaTileShapeMNK{})> smem; + }; + + // This struct has been modified to have a bool indicating that ptr_row is a + // scalar that must be broadcast, instead of containing a scalar that is + // valid if ptr_row is null. 
+ struct Arguments { + const Element* const* ptr_row_array = nullptr; + bool row_broadcast = true; + StrideMNL dRow = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90RowOrScalarBroadcastArray() { } + + CUTLASS_HOST_DEVICE + Sm90RowOrScalarBroadcastArray(Params const& params, SharedStorage const& shared_storage) + : params(params) + , smem(const_cast(shared_storage.smem.data())) { } + + Params params; + Element *smem = nullptr; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return (!params.row_broadcast && *(params.ptr_row_array[group]) == Element(0)); + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + GS_GTensor tGS_gRow_, GS_STensor tGS_sRow_, + GS_CTensor tGS_cRow_, Tiled_G2S tiled_g2s_, + SR_STensor tSR_sRow_, SR_RTensor tSR_rRow_, + CTensor tCcRow_, ThrResidue residue_tCcRow_, ThrNum thr_num_, + int group, Params const& params_) + : tGS_gRow(tGS_gRow_) + , tGS_sRow(tGS_sRow_) + , tGS_cRow(tGS_cRow_) + , tiled_G2S(tiled_g2s_) + , 
tSR_sRow(tSR_sRow_) + , tSR_rRow(tSR_rRow_) + , tCcRow(tCcRow_) + , residue_tCcRow(residue_tCcRow_) + , group(group) + , params(params_) {} + + GS_GTensor tGS_gRow; // (CPY,CPY_M,CPY_N) + GS_STensor tGS_sRow; // (CPY,CPY_M,CPY_N) + GS_CTensor tGS_cRow; // (CPY,CPY_M,CPY_N) + Tiled_G2S tiled_G2S; + + SR_STensor tSR_sRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + SR_RTensor tSR_rRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + CTensor tCcRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + ThrResidue residue_tCcRow; // (m, n) + ThrNum thr_num; + int group; + Params const& params; + + CUTLASS_DEVICE void + begin() { + if (!params.row_broadcast) { + fill(tSR_rRow, *(params.ptr_row_array[group])); + return; + } + + auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(thr_num, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; + Tensor tGS_gRow_flt = filter_zeros(tGS_gRow); + Tensor tGS_sRow_flt = filter_zeros(tGS_sRow); + Tensor tGS_cRow_flt = make_tensor(tGS_cRow.data(), make_layout(tGS_gRow_flt.shape(), tGS_cRow.stride())); + + for (int i = 0; i < size(tGS_gRow_flt); ++i) { + if (get<1>(tGS_cRow_flt(i)) >= size<1>(CtaTileShapeMNK{})) { + continue; // OOB of SMEM, + } + if (elem_less(tGS_cRow_flt(i), make_coord(get<0>(residue_tCcRow), get<1>(residue_tCcRow)))) { + tGS_sRow_flt(i) = tGS_gRow_flt(i); + } + else { + tGS_sRow_flt(i) = Element(0); // Set to Zero when OOB so LDS could be issue without any preds. 
+ } + } + synchronize(); + } + + CUTLASS_DEVICE void + begin_loop(int epi_m, int epi_n) { + if (epi_m == 0) { // Assumes M-major subtile loop + if (!params.row_broadcast) return; // Do not issue LDS when row is scalar + Tensor tSR_sRow_flt = filter_zeros(tSR_sRow(_,_,_,epi_m,epi_n)); + Tensor tSR_rRow_flt = filter_zeros(tSR_rRow); + copy(tSR_sRow_flt, tSR_rRow_flt); + } + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_row; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + frg_row[i] = tSR_rRow(epi_v * FragmentSize + i); + } + + return frg_row; + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + using ThreadCount = decltype(size(args.tiled_copy)); + + Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row_array[l]), make_shape(M,N,1), params.dRow); + Tensor gRow = local_tile(mRow(_,_,l), take<0,2>(args.tile_shape_mnk), make_coord(m, n)); // (CTA_M, CTA_N) + Tensor sRow = make_tensor(make_smem_ptr(smem), + make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{})), make_shape(_0{}, _1{})); // (CTA_M, CTA_N) + //// G2S: Gmem to Smem + auto tiled_g2s = make_tiled_copy(Copy_Atom{}, + Layout< Shape<_1, ThreadCount>, + Stride<_0, _1>>{}, + Layout<_1>{}); + auto thr_g2s = tiled_g2s.get_slice(args.thread_idx); + Tensor tGS_gRow = thr_g2s.partition_S(gRow); + Tensor tGS_sRow = thr_g2s.partition_D(sRow); + + //// G2S: Coord + auto cRow = make_identity_tensor(make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}))); + Tensor tGS_cRow = thr_g2s.partition_S(cRow); + + //// S2R: Smem to Reg + Tensor tSR_sRow = sm90_partition_for_epilogue(sRow, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tSR_rRow = 
make_tensor_like(take<0,3>(tSR_sRow)); // (CPY,CPY_M,CPY_N) + + return ConsumerStoreCallbacks( + tGS_gRow, + tGS_sRow, + tGS_cRow, tiled_g2s, + tSR_sRow, + tSR_rRow, + args.tCcD, + args.residue_cD, + ThreadCount{}, + l, + params); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Column vector broadcast +template< + int Stages, + class CtaTileShapeMNK, + class Element, + class StrideMNL = Stride<_1,_0,_0>, + int Alignment = 128 / sizeof_bits_v +> +struct Sm90ColOrScalarBroadcastArray { + static_assert(Stages == 0, "Column broadcast doesn't support smem usage yet"); + static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); + static_assert( + (cute::is_same_v>) || // col vector broadcast, e.g. per-row alpha/bias + (cute::is_same_v>)); // batched col vector broadcast, e.g. batched per-row bias + + // Accumulator distributes col elements evenly amongst threads so we can just directly load from gmem + struct SharedStorage { }; + + // This struct has been modified to have a bool indicating that ptr_col is a + // scalar that must be broadcast, instead of containing a scalar that is + // valid if ptr_col is null. 
+ struct Arguments { + const Element* const* ptr_col_array = nullptr; + bool col_broadcast = true; + StrideMNL dCol = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return (!params.col_broadcast && *(params.ptr_col_array[group]) == Element(0)); + } + + CUTLASS_HOST_DEVICE + Sm90ColOrScalarBroadcastArray() { } + + CUTLASS_HOST_DEVICE + Sm90ColOrScalarBroadcastArray(Params const& params, SharedStorage const& shared_storage) + : params(params) { } + + Params params; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + GTensor&& tCgCol, + RTensor&& tCrCol, + CTensor&& tCcCol, + ProblemShape problem_shape, + int group, + Params const& params + ): + tCgCol(cute::forward(tCgCol)), + tCrCol(cute::forward(tCrCol)), + tCcCol(cute::forward(tCcCol)), + m(get<0>(problem_shape)), + group(group), + params(params) {} + + GTensor tCgCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + RTensor tCrCol; + CTensor tCcCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + Params 
const& params; + int m; + int group; + + CUTLASS_DEVICE void + begin() { + Tensor pred = make_tensor(shape(tCgCol)); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(pred); ++i) { + pred(i) = get<0>(tCcCol(i)) < m; + } + + if (!params.col_broadcast) { + fill(tCrCol, *(params.ptr_col_array[group])); + return; + } + + // Filter so we don't issue redundant copies over stride-0 modes + // (only works if 0-strides are in same location, which is by construction) + copy_if(pred, filter(tCgCol), filter(tCrCol)); + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_col; + Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + frg_col[i] = tCrCol_mn(epi_v * FragmentSize + i); + } + + return frg_col; + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + + Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col_array[l]), make_shape(M,N,1), params.dCol); + Tensor tCgCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCrCol = make_tensor_like(tCgCol); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + // Generate an identity tensor matching the shape of the global tensor and + // partition the same way, this will be used to generate the predicate + // tensor for loading + Tensor cCol = make_identity_tensor(mCol.shape()); + Tensor tCcCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + cCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + return ConsumerStoreCallbacks( + cute::move(tCgCol), + cute::move(tCrCol), + 
cute::move(tCcCol), + args.problem_shape_mnkl, + l, + params + ); + } +}; + +} diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp index 0a812dc56a994..62b848a0a9635 100644 --- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp +++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp @@ -1,6 +1,7 @@ #pragma once #include "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp" +#include "cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp" /* This file defines custom epilogues for fusing channel scales, token scales, @@ -69,6 +70,16 @@ struct ScaledEpilogueBase { 0 /*Stages*/, TileShape, T, T, Stride, Int<1>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; + template + using ColOrScalarLoadArray = + cutlass::epilogue::fusion::Sm90ColOrScalarBroadcastArray< + 0 /*Stages*/, TileShape, T, Stride, Int<0>, Int<0>>>; + + template + using RowOrScalarLoadArray = + cutlass::epilogue::fusion::Sm90RowOrScalarBroadcastArray< + 0 /*Stages*/, TileShape, T, Stride, Int<1>, Int<0>>>; + // This utility function constructs the arguments for the load descriptors // from a tensor. It can handle both row and column, as well as row/column or // scalar cases. @@ -96,6 +107,14 @@ struct ScaledEpilogueBase { std::is_same_v>); return Arguments{data_ptr}; } + + template + static auto args_from_tensor(const T* const* data_ptr, bool do_broadcast) { + using Arguments = typename Descriptor::Arguments; + static_assert(std::is_same_v> || + std::is_same_v>); + return Arguments{data_ptr, do_broadcast}; + } }; /* @@ -381,4 +400,51 @@ struct ScaledEpilogueBiasAzpToken } }; +/* + This epilogue works like ScaledEpilogue, but ScaleA and ScaleB are pointers + to arrays containing different scales used in group gemm. The number of + pointers in ScaleA and the number of pointers in ScaleB are equal to the + group size. 
+*/ +template +struct ScaledEpilogueArray + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoadArray; + using ScaleB = typename SUPER::template RowOrScalarLoadArray; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + using ScaleAArray = typename SUPER::template ColOrScalarLoadArray; + using ScaleBArray = typename SUPER::template RowOrScalarLoadArray; + + static ArgumentType prepare_args(float const* const* a_scales_ptr, + float const* const* b_scales_ptr, + bool a_col_broadcast, bool b_row_broadcast) { + auto a_args = SUPER::template args_from_tensor( + a_scales_ptr, a_col_broadcast); + auto b_args = SUPER::template args_from_tensor( + b_scales_ptr, b_row_broadcast); + + typename EVTCompute0::Arguments evt0_args{b_args, {}, {}}; + return ArgumentType{a_args, evt0_args, {}}; + } +}; + }; // namespace vllm::c3x diff --git a/csrc/ops.h b/csrc/ops.h index 7434aead57f0e..1ea9f465cf21d 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -164,6 +164,7 @@ int64_t ggml_moe_get_block_size(int64_t type); bool cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability); bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability); +bool cutlass_group_gemm_supported(int64_t cuda_device_capability); void cutlass_scaled_fp4_mm(torch::Tensor& D, torch::Tensor const& A, torch::Tensor const& B, torch::Tensor const& A_sf, @@ -175,6 +176,19 @@ void 
cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b_scales, std::optional const& bias); +void cutlass_moe_mm( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides); + +void get_cutlass_moe_mm_data( + const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, + torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, + torch::Tensor& input_permutation, torch::Tensor& output_permutation, + const int64_t num_experts, const int64_t n, const int64_t k); + void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, diff --git a/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh b/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh new file mode 100644 index 0000000000000..6c6e89790847f --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh @@ -0,0 +1,80 @@ +#pragma once + +#include +#include +#include + +#include "core/scalar_type.hpp" +#include "cutlass/bfloat16.h" +#include "cutlass/float8.h" + +template +__global__ void get_group_gemm_starts( + int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets, + ElementC** out_offsets, ElementAccumulator** a_scales_offsets, + ElementAccumulator** b_scales_offsets, ElementAB* a_base_as_int, + ElementAB* b_base_as_int, ElementC* out_base_as_int, + ElementAccumulator* a_scales_base_as_int, + ElementAccumulator* b_scales_base_as_int, int64_t n, int64_t k, + bool per_act_token, bool per_out_ch) { + int expert_id = threadIdx.x; + + int64_t expert_offset = expert_offsets[expert_id]; + + a_offsets[expert_id] = a_base_as_int + expert_offset * k; + b_offsets[expert_id] = b_base_as_int + expert_id * k * n; + 
out_offsets[expert_id] = out_base_as_int + expert_offset * n; + a_scales_offsets[expert_id] = + a_scales_base_as_int + (per_act_token ? expert_offset : 0); + b_scales_offsets[expert_id] = + b_scales_base_as_int + (per_out_ch ? n * expert_id : expert_id); +} + +#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE) \ + else if (out_tensors.dtype() == TENSOR_C_TYPE) { \ + get_group_gemm_starts \ + <<<1, num_experts, 0, stream>>>( \ + static_cast(expert_offsets.data_ptr()), \ + static_cast(a_ptrs.data_ptr()), \ + static_cast(b_ptrs.data_ptr()), \ + static_cast(out_ptrs.data_ptr()), \ + static_cast(a_scales_ptrs.data_ptr()), \ + static_cast(b_scales_ptrs.data_ptr()), \ + static_cast(a_tensors.data_ptr()), \ + static_cast(b_tensors.data_ptr()), \ + static_cast(out_tensors.data_ptr()), \ + static_cast(a_scales.data_ptr()), \ + static_cast(b_scales.data_ptr()), out_tensors.size(1), \ + a_tensors.size(1), per_act_token, per_out_ch); \ + } + +namespace { + +void run_get_group_gemm_starts( + torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs, + torch::Tensor& b_ptrs, torch::Tensor& out_ptrs, + torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs, + torch::Tensor const& a_tensors, torch::Tensor const& b_tensors, + torch::Tensor& out_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(a_scales.dtype() == torch::kFloat32); + TORCH_CHECK(b_scales.dtype() == torch::kFloat32); + + int num_experts = static_cast(expert_offsets.size(0)); + bool per_act_token = a_scales.numel() != 1; + bool per_out_ch = b_scales.numel() != num_experts; + + auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index()); + + if (false) { + } + __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t) + __CALL_GET_STARTS_KERNEL(torch::kFloat16, half) + else { + TORCH_CHECK(false, "Invalid output type (must be float16 or 
bfloat16)"); + } +} + +} // namespace \ No newline at end of file diff --git a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu new file mode 100644 index 0000000000000..2b8bc3fb0b261 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu @@ -0,0 +1,160 @@ +#include + +#include +#include + +#include "cutlass/cutlass.h" +#include "grouped_mm_c3x.cuh" + +using namespace cute; + +namespace { + +template typename Epilogue> +struct sm90_fp8_config_default { + // M in (16, inf) + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = + cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong; + using TileShape = cute::Shape; + using ClusterShape = cute::Shape; + + using Cutlass3xGemm = + cutlass_3x_group_gemm; +}; + +template typename Epilogue> +struct sm90_fp8_config_M16 { + // M in [1, 16] + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = + cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong; + using TileShape = cute::Shape; + using ClusterShape = cute::Shape; + + using Cutlass3xGemm = + cutlass_3x_group_gemm; +}; + +template typename Epilogue> +struct sm90_fp8_config_K8192 { + // K in [8192, inf) + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = + cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong; + using TileShape = cute::Shape; + using ClusterShape = cute::Shape; + + using Cutlass3xGemm = + cutlass_3x_group_gemm; +}; + +template typename Epilogue> +struct sm90_fp8_config_N8192 { + // N in [8192, inf) + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = + 
cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong; + using TileShape = cute::Shape; + using ClusterShape = cute::Shape; + + using Cutlass3xGemm = + cutlass_3x_group_gemm; +}; + +template +void run_cutlass_moe_mm_sm90( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides) { + TORCH_CHECK(a_tensors.size(0) > 0, "No input A tensors provided."); + TORCH_CHECK(b_tensors.size(0) > 0, "No input B tensors provided."); + TORCH_CHECK(out_tensors.size(0) > 0, "No output tensors provided."); + + TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn, + "A tensors must be of type float8_e4m3fn."); + TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn, + "B tensors must be of type float8_e4m3fn."); + + TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn); + + using Cutlass3xGemmN8192 = typename sm90_fp8_config_N8192< + InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm; + using Cutlass3xGemmK8192 = typename sm90_fp8_config_K8192< + InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm; + using Cutlass3xGemmM16 = typename sm90_fp8_config_M16< + InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm; + using Cutlass3xGemmDefault = typename sm90_fp8_config_default< + InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm; + + uint32_t const m = a_tensors.size(0); + uint32_t const n = out_tensors.size(1); + uint32_t const k = a_tensors.size(1); + + if (n >= 8192) { + cutlass_group_gemm_caller( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides); + } else if (k >= 8192) { + cutlass_group_gemm_caller( + out_tensors, a_tensors, 
b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides); + } else if (m <= 16) { + cutlass_group_gemm_caller( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides); + } else { + cutlass_group_gemm_caller( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides); + } +} + +void dispatch_moe_mm_sm90( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides) { + if (out_tensors.dtype() == torch::kBFloat16) { + run_cutlass_moe_mm_sm90( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides); + } else { + run_cutlass_moe_mm_sm90( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides); + } +} + +} // namespace + +void cutlass_moe_mm_sm90( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides) { + dispatch_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales, + expert_offsets, problem_sizes, a_strides, b_strides, + c_strides); +} diff --git a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh new file mode 100644 index 0000000000000..db827b7c5e186 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh @@ -0,0 +1,149 @@ +#pragma once + 
+#include "cutlass/cutlass.h" + +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" + +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" +#include "cutlass_extensions/common.hpp" +#include "get_group_starts.cuh" + +using namespace cute; + +namespace { + +using ProblemShape = + cutlass::gemm::GroupProblemShape>; + +using ElementAccumulator = float; +using ArchTag = cutlass::arch::Sm90; +using OperatorClass = cutlass::arch::OpClassTensorOp; + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using LayoutC = cutlass::layout::RowMajor; + +template typename Epilogue_, + typename TileShape, typename ClusterShape, typename KernelSchedule, + typename EpilogueSchedule> +struct cutlass_3x_group_gemm { + using ElementAB = ElementAB_; + using ElementC = void; + using ElementD = ElementC_; + using ElementAccumulator = float; + + using Epilogue = Epilogue_; + + using StrideC = + cute::remove_pointer_t, cute::Int<0>>>; + + static constexpr int AlignmentAB = + 128 / cutlass::sizeof_bits::value; + static constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; + + using EVTCompute = typename Epilogue::EVTCompute; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, OperatorClass, TileShape, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, + ElementAccumulator, ElementC, LayoutC*, AlignmentC, ElementD, + LayoutC*, AlignmentC, EpilogueSchedule, EVTCompute>::CollectiveOp; + + static constexpr size_t CEStorageSize = + sizeof(typename CollectiveEpilogue::SharedStorage); + using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(CEStorageSize)>; + + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, ElementAB, LayoutA*, 
AlignmentAB, ElementAB, + LayoutB*, AlignmentAB, ElementAccumulator, TileShape, ClusterShape, + Stages, KernelSchedule>::CollectiveOp; + + using KernelType = enable_sm90_only>; + + struct GemmKernel : public KernelType {}; +}; + +template +void cutlass_group_gemm_caller( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides) { + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename Gemm::ElementD; + + int num_experts = static_cast(expert_offsets.size(0)); + int k_size = a_tensors.size(1); + int n_size = out_tensors.size(1); + + bool per_act_token = a_scales.numel() != 1; + bool per_out_ch = b_scales.numel() != num_experts; + + auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index()); + + auto options_int = + torch::TensorOptions().dtype(torch::kInt64).device(a_tensors.device()); + + torch::Tensor a_ptrs = torch::empty(num_experts, options_int); + torch::Tensor b_ptrs = torch::empty(num_experts, options_int); + torch::Tensor out_ptrs = torch::empty(num_experts, options_int); + torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int); + torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int); + + run_get_group_gemm_starts(expert_offsets, a_ptrs, b_ptrs, out_ptrs, + a_scales_ptrs, b_scales_ptrs, a_tensors, b_tensors, + out_tensors, a_scales, b_scales); + + using GemmKernel = typename Gemm::GemmKernel; + using StrideA = Stride, Int<0>>; + using StrideB = Stride, Int<0>>; + using StrideC = typename GemmKernel::InternalStrideC; + + ProblemShape::UnderlyingProblemShape* problem_sizes_as_shapes = + static_cast( + problem_sizes.data_ptr()); + ProblemShape prob_shape{num_experts, problem_sizes_as_shapes, nullptr}; + + typename 
GemmKernel::MainloopArguments mainloop_args{ + static_cast(a_ptrs.data_ptr()), + static_cast(a_strides.data_ptr()), + static_cast(b_ptrs.data_ptr()), + static_cast(b_strides.data_ptr())}; + + // Currently, we are only able to do broadcast on either all or none a_scales + // and on either all or none b_scales + typename GemmKernel::EpilogueArguments epilogue_args{ + Gemm::Epilogue::prepare_args( + static_cast(a_scales_ptrs.data_ptr()), + static_cast(b_scales_ptrs.data_ptr()), + per_act_token, per_out_ch), + nullptr, static_cast(c_strides.data_ptr()), + static_cast(out_ptrs.data_ptr()), + static_cast(c_strides.data_ptr())}; + + typename GemmKernel::Arguments args{ + cutlass::gemm::GemmUniversalMode::kGrouped, prob_shape, mainloop_args, + epilogue_args}; + + using GemmOp = cutlass::gemm::device::GemmUniversalAdapter; + GemmOp gemm_op; + CUTLASS_CHECK(gemm_op.can_implement(args)); + + size_t workspace_size = gemm_op.get_workspace_size(args); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(a_tensors.device()); + auto workspace = torch::empty(workspace_size, workspace_options); + + cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream); + CUTLASS_CHECK(status); +} + +} // namespace diff --git a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu new file mode 100644 index 0000000000000..2fb0417ce6c41 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu @@ -0,0 +1,90 @@ +#include + +#include +#include + +#include + +constexpr uint64_t THREADS_PER_EXPERT = 512; + +__global__ void compute_problem_sizes(const int* __restrict__ topk_ids, + int32_t* problem_sizes1, + int32_t* problem_sizes2, + int32_t* atomic_buffer, + const int topk_length, const int n, + const int k) { + int expert_id = blockIdx.x; + + int occurrences = 0; + for (int i = threadIdx.x; i < topk_length; i += THREADS_PER_EXPERT) { + occurrences += (topk_ids[i] == expert_id); + } + 
atomicAdd(&atomic_buffer[expert_id], occurrences); + __syncthreads(); + + if (threadIdx.x == 0) { + int final_occurrences = atomic_buffer[expert_id]; + problem_sizes1[expert_id * 3] = final_occurrences; + problem_sizes1[expert_id * 3 + 1] = 2 * n; + problem_sizes1[expert_id * 3 + 2] = k; + problem_sizes2[expert_id * 3] = final_occurrences; + problem_sizes2[expert_id * 3 + 1] = k; + problem_sizes2[expert_id * 3 + 2] = n; + } +} + +__global__ void compute_expert_offsets( + const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets, + int32_t* atomic_buffer, const int num_experts) { + int32_t tot_offset = 0; + expert_offsets[0] = 0; + for (int i = 0; i < num_experts; ++i) { + atomic_buffer[i] = tot_offset; + tot_offset += problem_sizes1[i * 3]; + expert_offsets[i + 1] = tot_offset; + } +} + +__global__ void compute_arg_sorts(const int* __restrict__ topk_ids, + int32_t* input_permutation, + int32_t* output_permutation, + int32_t* atomic_buffer, const int topk_length, + const int topk) { + int expert_id = blockIdx.x; + + for (int i = threadIdx.x; i < topk_length; i += THREADS_PER_EXPERT) { + if (topk_ids[i] == expert_id) { + int start = atomicAdd(&atomic_buffer[expert_id], 1); + input_permutation[start] = i / topk; + output_permutation[i] = start; + } + } +} + +void get_cutlass_moe_mm_data_caller( + const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, + torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, + torch::Tensor& input_permutation, torch::Tensor& output_permutation, + const int64_t num_experts, const int64_t n, const int64_t k) { + auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index()); + auto options_int32 = + torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device()); + torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32); + + int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel()); + compute_problem_sizes<<>>( + static_cast(topk_ids.data_ptr()), + 
static_cast(problem_sizes1.data_ptr()), + static_cast(problem_sizes2.data_ptr()), + static_cast(atomic_buffer.data_ptr()), topk_ids.numel(), n, k); + compute_expert_offsets<<<1, 1, 0, stream>>>( + static_cast(problem_sizes1.data_ptr()), + static_cast(expert_offsets.data_ptr()), + static_cast(atomic_buffer.data_ptr()), num_experts); + compute_arg_sorts<<>>( + static_cast(topk_ids.data_ptr()), + static_cast(input_permutation.data_ptr()), + static_cast(output_permutation.data_ptr()), + static_cast(atomic_buffer.data_ptr()), topk_ids.numel(), + topk_ids.size(1)); +} diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index b08386459cbe2..54b63894e4cbc 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -29,6 +29,20 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, std::optional const& bias); + +void cutlass_moe_mm_sm90( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides); + +void get_cutlass_moe_mm_data_caller( + const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, + torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, + torch::Tensor& input_permutation, torch::Tensor& output_permutation, + const int64_t num_experts, const int64_t n, const int64_t k); + #endif #if defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM100 @@ -102,6 +116,19 @@ bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) { return false; } +bool cutlass_group_gemm_supported(int64_t cuda_device_capability) { + // CUTLASS groped FP8 kernels need at least CUDA 12.3 + // and 
SM90 (Hopper) + +#if defined CUDA_VERSION + if (cuda_device_capability == 90) { + return CUDA_VERSION >= 12030; + } +#endif + + return false; +} + void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, @@ -168,6 +195,46 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, version_num); } +void cutlass_moe_mm( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides) { + int32_t version_num = get_sm_version_num(); +#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90 + cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales, + expert_offsets, problem_sizes, a_strides, b_strides, + c_strides); + return; +#endif + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "No compiled cutlass_scaled_mm for CUDA device capability: ", version_num, + ". Required capability: 90"); +} + +void get_cutlass_moe_mm_data( + const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, + torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, + torch::Tensor& input_permutation, torch::Tensor& output_permutation, + const int64_t num_experts, const int64_t n, const int64_t k) { + // This function currently gets compiled only if we have a valid cutlass moe + // mm to run it for. 
+ int32_t version_num = get_sm_version_num(); +#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90 + get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1, + problem_sizes2, input_permutation, + output_permutation, num_experts, n, k); + return; +#endif + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "No compiled get_cutlass_moe_mm_data: no cutlass_scaled_mm kernel for " + "CUDA device capability: ", + version_num, ". Required capability: 90"); +} + void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index eb3a2c911d55e..60ad6430336a5 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -365,6 +365,35 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool"); ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8); + // Check if cutlass grouped gemm is supported for CUDA devices of the given + // capability + ops.def("cutlass_group_gemm_supported(int cuda_device_capability) -> bool"); + ops.impl("cutlass_group_gemm_supported", &cutlass_group_gemm_supported); + + // CUTLASS w8a8 grouped GEMM + ops.def( + "cutlass_moe_mm(Tensor! out_tensors, Tensor a_tensors, Tensor b_tensors, " + " Tensor a_scales, Tensor b_scales, Tensor expert_offsets, " + " Tensor problem_sizes, Tensor a_strides, " + " Tensor b_strides, Tensor c_strides) -> ()", + {stride_tag}); + ops.impl("cutlass_moe_mm", torch::kCUDA, &cutlass_moe_mm); + + // A function that computes data required to run fused MoE with w8a8 grouped + // GEMM. It takes topk_ids as an input, and computes expert_offsets + // (token start indices of each expert). 
In addition to this, it computes + // problem sizes for each expert's multiplication used by the two mms called + // from fused MoE operation, and arrays with permutations required to shuffle + // and de-shuffle the input/output of the fused operation. + ops.def( + "get_cutlass_moe_mm_data(Tensor topk_ids, Tensor! expert_offsets, " + " Tensor! problem_sizes1, Tensor! problem_sizes2, " + " Tensor! input_permutation, " + " Tensor! output_permutation, int num_experts, " + " int n, int k) -> ()", + {stride_tag}); + ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data); + // Check if cutlass scaled_mm supports block quantization (used by DeepSeekV3) ops.def( "cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> " diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index 72fc660a653d5..f11ce6f45a984 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -3,6 +3,7 @@ Run `pytest tests/kernels/test_cutlass.py`. 
""" +import random import pytest import torch @@ -507,3 +508,136 @@ def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool): def test_cutlass_support_opcheck(): opcheck(torch.ops._C.cutlass_scaled_mm_supports_fp8, (capability, )) + + +@pytest.mark.parametrize("num_experts", [8, 64]) +@pytest.mark.parametrize("per_act_token", [True, False]) +@pytest.mark.parametrize("per_out_ch", [True, False]) +@pytest.mark.parametrize("use_bias", [False]) +@pytest.mark.skipif( + (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( + current_platform.get_device_capability()), + reason="Grouped gemm is not supported on this GPU type.") +def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool, + per_out_ch: bool, use_bias: bool): + + # Device and dtype setup + device = "cuda" + out_dtype = torch.half + + # Create separate A, B, C tensors for each group + a_tensors = [] + b_tensors = [] + a_scales_tensors = [] + b_scales_tensors = [] + baseline_tensors = [] + + expert_offsets = torch.zeros((num_experts + 1), + device=device, + dtype=torch.int32) + + problem_sizes = torch.zeros((num_experts, 3), + device=device, + dtype=torch.int32) + + if not per_act_token: + one_scale_a = torch.randn((1, 1), device=device, dtype=torch.float32) + + alignment = 16 # 128 // 8 + # For variation, each group has dimensions + n_g = alignment * random.randint(1, 64) + k_g = alignment * random.randint(1, 64) + for g in range(num_experts): + m_g = alignment * random.randint(1, 64) + + expert_offsets[g + 1] = expert_offsets[g] + m_g + problem_sizes[g][0] = m_g + problem_sizes[g][1] = n_g + problem_sizes[g][2] = k_g + + m_a_scales = m_g if per_act_token else 1 + n_b_scales = n_g if per_out_ch else 1 + + print("shape:", m_g, n_g, k_g) + + # Create group-specific A and B (FP8) and output (FP16/FP32) + a_g = to_fp8(torch.randn((m_g, k_g), device=device)) + b_g = to_fp8(torch.randn((n_g, k_g), device=device).t()) + a_tensors.append(a_g) + b_tensors.append(b_g) + + # Set 
up A/B scales + scale_b = torch.randn((1, n_b_scales), + device=device, + dtype=torch.float32) + b_scales_tensors.append(scale_b) + + if per_act_token: + scale_a = torch.randn((m_a_scales, 1), + device=device, + dtype=torch.float32) + a_scales_tensors.append(scale_a) + else: + scale_a = one_scale_a + + # Compute baseline result for this group + baseline_g = baseline_scaled_mm(a_g, b_g, scale_a, scale_b, out_dtype, + None) + baseline_tensors.append(baseline_g) + + a_tensors_stacked = torch.empty((expert_offsets[num_experts], k_g), + device=device, + dtype=torch.float8_e4m3fn) + b_tensors_stacked = torch.empty((num_experts, n_g, k_g), + device=device, + dtype=torch.float8_e4m3fn) + + for g in range(num_experts): + a_tensors_stacked[expert_offsets[g]:expert_offsets[g + + 1]] = a_tensors[g] + b_tensors_stacked[g] = b_tensors[g].t() + b_tensors_stacked = b_tensors_stacked.transpose(1, 2) + + if per_act_token: + a_scales_tensors_stacked = torch.empty( + (expert_offsets[num_experts], 1), + device=device, + dtype=torch.float32) + for g in range(num_experts): + a_scales_tensors_stacked[ + expert_offsets[g]:expert_offsets[g + 1]] = a_scales_tensors[g] + else: + a_scales_tensors_stacked = one_scale_a + + b_scales_tensors_stacked = torch.empty((num_experts, n_b_scales), + device=device, + dtype=torch.float32) + for g in range(num_experts): + b_scales_tensors_stacked[g] = b_scales_tensors[g] + + out_tensors_stacked = torch.zeros((expert_offsets[num_experts], n_g), + device=device, + dtype=out_dtype) + + ab_strides = torch.full((num_experts, ), + a_tensors_stacked.stride(0), + device="cuda", + dtype=torch.int64) + c_strides = torch.full((num_experts, ), + out_tensors_stacked.stride(0), + device="cuda", + dtype=torch.int64) + + ops.cutlass_moe_mm(out_tensors_stacked, a_tensors_stacked, + b_tensors_stacked, a_scales_tensors_stacked, + b_scales_tensors_stacked, expert_offsets[:-1], + problem_sizes, ab_strides, ab_strides, c_strides) + + # Validate each group's result against the 
baseline + for g in range(num_experts): + baseline = baseline_tensors[g] + c = out_tensors_stacked[expert_offsets[g]:expert_offsets[g + 1]] + print(baseline) + print(c) + print("*") + torch.testing.assert_close(c, baseline, rtol=1e-2, atol=5e-4) diff --git a/tests/kernels/test_cutlass_moe.py b/tests/kernels/test_cutlass_moe.py new file mode 100644 index 0000000000000..1652c72d86fe1 --- /dev/null +++ b/tests/kernels/test_cutlass_moe.py @@ -0,0 +1,244 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch + +from vllm import _custom_ops as ops +from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.fused_moe import (cutlass_moe_fp8, + fused_experts, + fused_topk) +from vllm.platforms import current_platform + +NUM_EXPERTS = [40, 64] +TOP_KS = [6, 8] + + +def run(a: torch.Tensor, a_scale: torch.Tensor, w1_q: torch.Tensor, + w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, + ab_strides1: torch.Tensor, c_strides1: torch.Tensor, + ab_strides2: torch.Tensor, c_strides2: torch.Tensor): + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + return cutlass_moe_fp8(a, + w1_q, + w2_q, + w1_scale, + w2_scale, + topk_weights, + topk_ids, + ab_strides1, + c_strides1, + ab_strides2, + c_strides2, + a1_scale=a_scale) + + +@pytest.mark.parametrize("m", [2, 64, 224]) +@pytest.mark.parametrize("n", [1024, 3072]) +@pytest.mark.parametrize("k", [1024, 1536]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("per_act_token", [True, False]) +@pytest.mark.parametrize("per_out_ch", [True, False]) +@pytest.mark.skipif( + (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( + current_platform.get_device_capability()), + reason="Grouped gemm is not supported on this GPU type.") +def 
test_cutlass_moe_no_graph( + m: int, + n: int, + k: int, + e: int, + topk: int, + per_act_token: bool, + per_out_ch: bool, +): + current_platform.seed_everything(7) + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + + dtype = torch.half + + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 + + # Get the right scale for tests. + _, a_scale1 = ops.scaled_fp8_quant( + a, use_per_token_if_dynamic=per_act_token) + a_q, _ = ops.scaled_fp8_quant(a, + a_scale1, + use_per_token_if_dynamic=per_act_token) + + a_d = a_q.float().mul(a_scale1).to(dtype) + + n_b_scales = 2 * n if per_out_ch else 1 + k_b_scales = k if per_out_ch else 1 + + w1_q = torch.empty((e, 2 * n, k), + device="cuda", + dtype=torch.float8_e4m3fn) + w2_q = torch.empty((e, k, n), device="cuda", dtype=torch.float8_e4m3fn) + w1_scale = torch.empty((e, n_b_scales, 1), + device="cuda", + dtype=torch.float32) + w2_scale = torch.empty((e, k_b_scales, 1), + device="cuda", + dtype=torch.float32) + + ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64) + c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + + for expert in range(e): + w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant( + w1[expert], use_per_token_if_dynamic=per_out_ch) + w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant( + w2[expert], use_per_token_if_dynamic=per_out_ch) + w1_q = w1_q.transpose(1, 2) + w2_q = w2_q.transpose(1, 2) + + ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64) + c_strides2 = torch.full((e, ), k, 
device="cuda", dtype=torch.int64) + + w1_d = torch.empty_like(w1) + w2_d = torch.empty_like(w2) + for expert in range(e): + w1_d[expert] = (w1_q[expert].t().float() * w1_scale[expert]).half() + w2_d[expert] = (w2_q[expert].t().float() * w2_scale[expert]).half() + + score = torch.randn((m, e), device="cuda", dtype=dtype) + topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False) + + triton_output = fused_experts(a_d, w1_d, w2_d, topk_weights, topk_ids) + + cutlass_output = cutlass_moe_fp8(a, + w1_q, + w2_q, + w1_scale, + w2_scale, + topk_weights, + topk_ids, + ab_strides1, + c_strides1, + ab_strides2, + c_strides2, + a1_scale=a_scale1) + + print(triton_output) + print(cutlass_output) + print("*") + + torch.testing.assert_close(triton_output, + cutlass_output, + atol=5e-2, + rtol=1e-2) + + +@pytest.mark.parametrize("m", [2, 64, 224]) +@pytest.mark.parametrize("n", [1024, 3072]) +@pytest.mark.parametrize("k", [1024, 1536]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("per_act_token", [True, False]) +@pytest.mark.parametrize("per_out_ch", [True, False]) +@pytest.mark.skipif( + (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( + current_platform.get_device_capability()), + reason="Grouped gemm is not supported on this GPU type.") +def test_cutlass_moe_cuda_graph( + m: int, + n: int, + k: int, + e: int, + topk: int, + per_act_token: bool, + per_out_ch: bool, +): + current_platform.seed_everything(7) + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + + dtype = torch.half + + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 + + # Get the right scale for tests. 
+ _, a_scale1 = ops.scaled_fp8_quant( + a, use_per_token_if_dynamic=per_act_token) + a_q, _ = ops.scaled_fp8_quant(a, + a_scale1, + use_per_token_if_dynamic=per_act_token) + + a_d = a_q.float().mul(a_scale1).to(dtype) + + n_b_scales = 2 * n if per_out_ch else 1 + k_b_scales = k if per_out_ch else 1 + + w1_q = torch.empty((e, 2 * n, k), + device="cuda", + dtype=torch.float8_e4m3fn) + w2_q = torch.empty((e, k, n), device="cuda", dtype=torch.float8_e4m3fn) + w1_scale = torch.empty((e, n_b_scales, 1), + device="cuda", + dtype=torch.float32) + w2_scale = torch.empty((e, k_b_scales, 1), + device="cuda", + dtype=torch.float32) + + ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64) + c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + + for expert in range(e): + w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant( + w1[expert], use_per_token_if_dynamic=per_out_ch) + w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant( + w2[expert], use_per_token_if_dynamic=per_out_ch) + w1_q = w1_q.transpose(1, 2) + w2_q = w2_q.transpose(1, 2) + + ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64) + c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + + w1_d = torch.empty_like(w1) + w2_d = torch.empty_like(w2) + for expert in range(e): + w1_d[expert] = (w1_q[expert].t().float() * w1_scale[expert]).half() + w2_d[expert] = (w2_q[expert].t().float() * w2_scale[expert]).half() + + score = torch.randn((m, e), device="cuda", dtype=dtype) + topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False) + + triton_output = fused_experts(a_d, w1_d, w2_d, topk_weights, topk_ids) + + stream = torch.cuda.Stream() + graph = 
torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, stream=stream): + cutlass_output = run(a, a_scale1, w1_q, w2_q, w1_scale, w2_scale, + topk_weights, topk_ids, ab_strides1, + c_strides1, ab_strides2, c_strides2) + torch.cuda.synchronize() + graph.replay() + torch.cuda.synchronize() + + print(triton_output) + print(cutlass_output) + print("*") + + torch.testing.assert_close(triton_output, + cutlass_output, + atol=9e-2, + rtol=1e-2) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index dc07bad4680f9..2ffcef414cb28 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -587,6 +587,9 @@ def cutlass_sparse_scaled_mm_supported(cuda_device_capability: int) -> bool: cuda_device_capability) +def cutlass_group_gemm_supported(cuda_device_capability: int) -> bool: + return torch.ops._C.cutlass_group_gemm_supported(cuda_device_capability) + def cutlass_sparse_compress(a: torch.Tensor) \ -> tuple[torch.Tensor, torch.Tensor]: """ @@ -677,6 +680,56 @@ def cutlass_scaled_sparse_mm( return out +def get_cutlass_moe_mm_data( + topk_ids: torch.Tensor, expert_offsets: torch.Tensor, + problem_sizes1: torch.Tensor, problem_sizes2: torch.Tensor, + input_permutation: torch.Tensor, output_permutation: torch.Tensor, + num_experts: int, n: int, k: int): + """ + Prepare data necessary to perform CUTLASS grouped matrix multiplications + used in CUTLASS-based fused MoE. + + The function takes in topk_ids (token-expert mapping) and uses it to + compute: + - expert_offsets: Indices that mark at which token index each expert begins + its computation after the input is sorted with + input_permutation. The number of tokens computed with + expert E is expert_offsets[E + 1] - expert_offsets[E] + - problem_sizes1, problem_sizes2: MxNxK sizes of each expert's + multiplication in two grouped MMs used in + the fused MoE operation. + - input_permutation: Permutation that must be used to shuffle the input + before executing the MMs. 
+ - output_permutation: Permutation that must be used to shuffle the output + after executing the MMs. + """ + torch.ops._C.get_cutlass_moe_mm_data(topk_ids, expert_offsets, + problem_sizes1, problem_sizes2, + input_permutation, output_permutation, + num_experts, n, k) + + +def cutlass_moe_mm(out_tensors: torch.Tensor, a_tensors: torch.Tensor, + b_tensors: torch.Tensor, a_scales: torch.Tensor, + b_scales: torch.Tensor, expert_offsets: torch.Tensor, + problem_sizes: torch.Tensor, a_strides: torch.Tensor, + b_strides: torch.Tensor, c_strides: torch.Tensor): + """ + A single grouped matrix multiplication used in CUTLASS-based fused MoE. + The function executes fp8-quantized OUT = AB matrix multiplication. + + - expert_offsets: Indices that mark at which token index each expert begins + its computation. The number of tokens computed with + expert E is expert_offsets[E + 1] - expert_offsets[E] + - problem_sizes: MxNxK sizes of each expert's multiplication in two grouped + MMs used in the fused MoE operation. + - a/b/c_strides: The data strides passed to grouped matrix multiplication. 
+ """ + torch.ops._C.cutlass_moe_mm(out_tensors, a_tensors, b_tensors, a_scales, + b_scales, expert_offsets, problem_sizes, + a_strides, b_strides, c_strides) + + # aqlm def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor, codebooks: torch.Tensor, scales: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 6f933c3fa3c9f..e096d14fc6f91 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -36,8 +36,8 @@ if HAS_TRITON: import vllm.model_executor.layers.fused_moe.fused_marlin_moe # noqa import vllm.model_executor.layers.fused_moe.fused_moe # noqa from vllm.model_executor.layers.fused_moe.fused_moe import ( - fused_experts, fused_moe, fused_topk, get_config_file_name, - grouped_topk) + cutlass_moe_fp8, fused_experts, fused_moe, fused_topk, + get_config_file_name, grouped_topk) __all__ += [ "fused_moe", @@ -45,4 +45,5 @@ if HAS_TRITON: "fused_experts", "get_config_file_name", "grouped_topk", + "cutlass_moe_fp8", ] diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index faaea6b4de972..0929530ebec4c 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1623,3 +1623,140 @@ def fused_moe( a1_scale=a1_scale, a2_scale=a2_scale, block_shape=block_shape) + + +#TODO make the grouped gemm kernel consistent with scaled gemm kernel +def cutlass_moe_fp8( + a: torch.Tensor, + w1_q: torch.Tensor, + w2_q: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + ab_strides1: torch.Tensor, + c_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides2: torch.Tensor, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + out_dtype: torch.dtype = torch.half, +) -> torch.Tensor: + """ + This function 
computes a a8w8-quantized Mixture of Experts (MoE) layer + using two sets of quantized weights, w1_q and w2_q, and top-k gating + mechanism. The matrix multiplications are implemented with CUTLASS + grouped gemm. + + Parameters: + - a (torch.Tensor): The input tensor to the MoE layer. + Shape: [M, K] + - w1_q (torch.Tensor): The first set of fp8-quantized expert weights. + Shape: [num_experts, K, 2N] (the weights are passed transposed) + - w2_q (torch.Tensor): The second set of fp8-quantized expert weights. + Shape: [num_experts, N, K] (the weights are passed transposed) + - w1_scale (torch.Tensor): The fp32 scale to dequantize w1_q. + Shape: [num_experts] or [num_experts, 2N] + - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q. + Shape: [num_experts] or [num_experts, K] + - gating_output (torch.Tensor): The output of the gating operation + (before softmax). + - topk_weights (torch.Tensor): The weights of each token->expert mapping. + - ab_strides1 (torch.Tensor): The input and weights strides of the first + grouped gemm. + - c_strides1 (torch.Tensor): The output strides of the first grouped gemm. + - ab_strides2 (torch.Tensor): The input and weights strides of the second + grouped gemm. + - c_strides2 (torch.Tensor): The output strides of the second grouped gemm. + - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a. + Shape: scalar or [M] + - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to + quantize the intermediate result between the gemms. + Shape: scalar or [M] + - out_dtype (torch.Tensor): The output tensor type. + + Returns: + - torch.Tensor: The fp16 output tensor after applying the MoE layer. 
+ """ + + assert topk_weights.shape == topk_ids.shape, "topk shape mismatch" + assert w1_q.dtype == torch.float8_e4m3fn + assert w2_q.dtype == torch.float8_e4m3fn + assert a.shape[1] == w1_q.shape[1], "Hidden size mismatch w1" + assert w1_q.shape[2] == w2_q.shape[1] * 2, "Hidden size mismatch w2" + assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch" + assert a1_scale is None or a1_scale.dim( + ) == 0 or a1_scale.shape[0] == 1 or a1_scale.shape[0] == a.shape[ + 0], "Input scale shape mismatch" + assert w1_scale.dim() == 1 or w1_scale.shape[1] == 1 or w1_scale.shape[ + 1] == w1_q.shape[2], "W1 scale shape mismatch" + assert w2_scale.dim() == 1 or w2_scale.shape[1] == 1 or w2_scale.shape[ + 1] == w2_q.shape[2], "W2 scale shape mismatch" + assert w1_q.shape[0] == w2_q.shape[0], "Weights expert number mismatch" + assert w1_q.shape[0] == w1_scale.shape[ + 0], "w1 scales expert number mismatch" + assert w1_q.shape[0] == w2_scale.shape[ + 0], "w2 scales expert number mismatch" + assert a2_scale is None or a1_scale is None or a2_scale.shape == a1_scale.shape, "Intermediate scale shape mismatch" # noqa: E501 + assert ab_strides1.shape[0] == w1_q.shape[ + 0], "AB Strides 1 expert number mismatch" + assert c_strides1.shape[0] == w1_q.shape[ + 0], "C Strides 1 expert number mismatch" + assert ab_strides2.shape[0] == w2_q.shape[ + 0], "AB Strides 2 expert number mismatch" + assert c_strides2.shape[0] == w2_q.shape[ + 0], "C Strides 2 expert number mismatch" + assert out_dtype in [torch.half, torch.bfloat16], "Invalid output dtype" + + num_experts = w1_q.size(0) + m = a.size(0) + k = w1_q.size(1) + n = w2_q.size(1) + + topk = topk_ids.size(1) + per_act_token = a1_scale.numel() != 1 if a1_scale is not None else ( + a2_scale.numel() != 1 if a2_scale is not None else False) + + a_q, a1_scale = ops.scaled_fp8_quant( + a, a1_scale, use_per_token_if_dynamic=per_act_token) + device = a_q.device + + expert_offsets = torch.empty((num_experts + 1), + dtype=torch.int32, + 
device=device) + problem_sizes1 = torch.empty((num_experts, 3), + dtype=torch.int32, + device=device) + problem_sizes2 = torch.empty((num_experts, 3), + dtype=torch.int32, + device=device) + + a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device) + c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device) + + ops.get_cutlass_moe_mm_data(topk_ids, expert_offsets, problem_sizes1, + problem_sizes2, a_map, c_map, num_experts, n, + k) + + rep_a_q = a_q.view(dtype=torch.uint8)[a_map].view(dtype=a_q.dtype) + rep_a1_scales = a1_scale[a_map] if per_act_token else a1_scale + + c1 = torch.empty((m * topk, n * 2), device=device, dtype=out_dtype) + c2 = torch.empty((m * topk, k), device=device, dtype=out_dtype) + + ops.cutlass_moe_mm(c1, rep_a_q, w1_q, rep_a1_scales, w1_scale, + expert_offsets[:-1], problem_sizes1, ab_strides1, + ab_strides1, c_strides1) + + intermediate = torch.empty((m * topk, n), device=device, dtype=out_dtype) + torch.ops._C.silu_and_mul(intermediate, c1) + + intemediate_q, a2_scale = ops.scaled_fp8_quant( + intermediate, a2_scale, use_per_token_if_dynamic=per_act_token) + + ops.cutlass_moe_mm(c2, intemediate_q, w2_q, a2_scale, w2_scale, + expert_offsets[:-1], problem_sizes2, ab_strides2, + ab_strides2, c_strides2) + + return (c2[c_map].view(m, topk, k) * + topk_weights.view(m, topk, 1).to(out_dtype)).sum(dim=1) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index ce6c706fe3d27..4b2d7ca2badee 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -96,7 +96,8 @@ class CompressedTensorsConfig(QuantizationConfig): if isinstance(layer, Attention): return CompressedTensorsKVCacheMethod(self) if isinstance(layer, FusedMoE): - return 
CompressedTensorsMoEMethod.get_moe_method(self) + return CompressedTensorsMoEMethod.get_moe_method( + self, layer.activation, layer.expert_map) return None @classmethod @@ -191,17 +192,26 @@ class CompressedTensorsConfig(QuantizationConfig): def _check_scheme_supported(self, min_capability: int, - error: bool = True) -> bool: + error: bool = True, + match_exact: bool = False) -> bool: capability_tuple = current_platform.get_device_capability() if capability_tuple is not None: capability = capability_tuple.to_int() - supported = capability >= min_capability - if error and not supported: - raise RuntimeError( - "Quantization scheme is not supported for ", - f"the current GPU. Min capability: {min_capability}. ", - f"Current capability: {capability}.") + if match_exact: + supported = capability == min_capability + if error and not supported: + raise RuntimeError( + "Quantization scheme is not supported for ", + "the current GPU. Required capability: ", + f"{min_capability}. Current capability: {capability}.") + else: + supported = capability >= min_capability + if error and not supported: + raise RuntimeError( + "Quantization scheme is not supported for ", + f"the current GPU. Min capability: {min_capability}. ", + f"Current capability: {capability}.") return supported else: return False @@ -262,6 +272,11 @@ class CompressedTensorsConfig(QuantizationConfig): input_quant.strategy == QuantizationStrategy.TENSOR) return is_symmetric_activation and is_per_tensor_activation + def _is_fp8_w8a8_sm90(self, weight_quant: BaseModel, + input_quant: BaseModel) -> bool: + return (self._check_scheme_supported(90, error=False, match_exact=True) + and self._is_fp8_w8a8(weight_quant, input_quant)) + def _is_fp8_w8a16(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: # Confirm weights quantized. 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index ff381a4cc1a7f..2e14845ff2d6f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -31,6 +31,7 @@ class GPTQMarlinState(Enum): __all__ = [ "CompressedTensorsMoEMethod", "CompressedTensorsW8A8Fp8MoEMethod", + "CompressedTensorsW8A8Fp8MoECutlassMethod", "CompressedTensorsWNA16MoEMethod" ] @@ -39,7 +40,9 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): @staticmethod def get_moe_method( - quant_config: "CompressedTensorsConfig" # type: ignore # noqa E501 + quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + activation: str, + expert_map: Optional[torch.Tensor], ) -> "CompressedTensorsMoEMethod": # TODO: @dsikka: refactor this to use schemes as other kernels # are supported + check if the layer is being ignored. 
@@ -49,6 +52,9 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): if quant_config._is_wNa16_group_channel(weight_quant, input_quant): return CompressedTensorsWNA16MoEMethod(quant_config) + elif (quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant) + and activation == "silu" and expert_map is None): + return CompressedTensorsW8A8Fp8MoECutlassMethod(quant_config) elif quant_config._is_fp8_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8Fp8MoEMethod(quant_config) else: @@ -250,6 +256,200 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): a2_scale=layer.w2_input_scale) +class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod): + + def __init__( + self, + quant_config: "CompressedTensorsConfig" # type: ignore # noqa E501 + ): + self.quant_config = quant_config + self.weight_quant = self.quant_config.target_scheme_map["Linear"].get( + "weights") + self.input_quant = self.quant_config.target_scheme_map["Linear"].get( + "input_activations") + + if not (self.weight_quant.strategy == QuantizationStrategy.TENSOR + and self.input_quant.strategy == QuantizationStrategy.TENSOR): + raise ValueError( + "For FP8 Fused MoE layers, only per-tensor scales " + "for weights and activations are supported. 
Found " + f"{self.weight_quant}, {self.input_quant}") + + self.static_input_scales = not self.input_quant.dynamic + + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size_per_partition: int, + params_dtype: torch.dtype, **extra_weight_attrs): + + params_dtype = torch.float8_e4m3fn + + # WEIGHTS + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + w2_weight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + # WEIGHT_SCALES + # Allocate 2 scales for w1 and w3 respectively. + # They will be combined to a single scale after weight loading. 
+ w13_weight_scale = torch.nn.Parameter(torch.ones(num_experts, + 2, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + + w2_weight_scale = torch.nn.Parameter(torch.ones(num_experts, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + # Add the quantization method used (per tensor/grouped/channel) + # to ensure the weight scales are loaded in properly + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + # INPUT_SCALES + if self.static_input_scales: + w13_input_scale = torch.nn.Parameter(torch.ones( + num_experts, dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w13_input_scale", w13_input_scale) + set_weight_attrs(w13_input_scale, extra_weight_attrs) + + w2_input_scale = torch.nn.Parameter(torch.ones( + num_experts, dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w2_input_scale", w2_input_scale) + set_weight_attrs(w2_input_scale, extra_weight_attrs) + else: + layer.w13_input_scale = None + layer.w2_input_scale = None + + device = w13_weight.device + # TODO strides can be shared across multiple layers + self.ab_strides1 = torch.full((num_experts, ), + hidden_size, + device=device, + dtype=torch.int64) + self.c_strides1 = torch.full((num_experts, ), + 2 * intermediate_size_per_partition, + device=device, + dtype=torch.int64) + self.ab_strides2 = torch.full((num_experts, ), + intermediate_size_per_partition, + device=device, + dtype=torch.int64) + self.c_strides2 = torch.full((num_experts, ), + hidden_size, + device=device, + dtype=torch.int64) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + # Fp8 moe kernels require a single activation scale. + # We take the max of all the scales in case they differ. 
+ if self.static_input_scales: + if (layer.w13_input_scale is None or layer.w2_input_scale is None): + raise ValueError( + "QuantConfig has static quantization, but found " + "activation scales are None.") + if (not all_close_1d(layer.w13_input_scale) + or not all_close_1d(layer.w2_input_scale)): + logger.warning_once( + "Found input_scales that are not equal for " + "fp8 MoE layer. Using the maximum across experts " + "for each layer.") + layer.w13_input_scale = torch.nn.Parameter( + layer.w13_input_scale.max(), requires_grad=False) + layer.w2_input_scale = torch.nn.Parameter( + layer.w2_input_scale.max(), requires_grad=False) + + # Fp8 moe kernel needs single weight scale for w13 per expert. + # We take the max then dequant and requant each expert. + assert layer.w13_weight_scale is not None + shard_size = layer.intermediate_size_per_partition + max_w13_scales = layer.w13_weight_scale.max(dim=1).values + for expert_id in range(layer.local_num_experts): + start = 0 + for shard_id in range(2): + dq_weight = per_tensor_dequantize( + layer.w13_weight[expert_id][start:start + shard_size, :], + layer.w13_weight_scale[expert_id][shard_id]) + layer.w13_weight[expert_id][ + start:start + shard_size, :], _ = ops.scaled_fp8_quant( + dq_weight, max_w13_scales[expert_id]) + start += shard_size + + layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales, + requires_grad=False) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + activation: str = "silu", + ) -> torch.Tensor: + + assert activation == "silu" + assert global_num_experts == layer.w13_weight.shape[0] + 
assert expert_map is None + + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias) + + from vllm.model_executor.layers.fused_moe import cutlass_moe_fp8 + + return cutlass_moe_fp8( + x, + layer.w13_weight.transpose(1, 2), + layer.w2_weight.transpose(1, 2), + layer.w13_weight_scale, + layer.w2_weight_scale, + topk_weights, + topk_ids, + self.ab_strides1, + self.c_strides1, + self.ab_strides2, + self.c_strides2, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + out_dtype=x.dtype, + ) + + class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): def __init__( diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 9de8e453354cd..c2bd4bce560e7 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -50,6 +50,16 @@ def cutlass_block_fp8_supported() -> bool: return ops.cutlass_scaled_mm_supports_block_fp8(capability) +def cutlass_group_gemm_supported() -> bool: + if not current_platform.is_cuda(): + return False + + capability_tuple = current_platform.get_device_capability() + capability = -1 if capability_tuple is None else capability_tuple.to_int() + + return ops.cutlass_group_gemm_supported(capability) + + CUTLASS_FP8_SUPPORTED = cutlass_fp8_supported() CUTLASS_BLOCK_FP8_SUPPORTED = cutlass_block_fp8_supported() diff --git a/vllm/utils.py b/vllm/utils.py index 101342333e66b..73de826266daa 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1568,18 +1568,21 @@ class ClassRegistry(UserDict[Type[T], _V]): return any(cls in self.data for cls in key.mro()) -def 
weak_ref_tensor(tensor: torch.Tensor) -> torch.Tensor: +def weak_ref_tensor(tensor: Any) -> Any: """ Create a weak reference to a tensor. The new tensor will share the same data as the original tensor, but will not keep the original tensor alive. """ - return torch.ops._C.weak_ref_tensor(tensor) + if isinstance(tensor, torch.Tensor): + return torch.ops._C.weak_ref_tensor(tensor) + else: + return tensor def weak_ref_tensors( tensors: Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor]] -) -> Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor]]: +) -> Union[torch.Tensor, list[Any], tuple[Any], Any]: """ Convenience function to create weak references to tensors, for single tensor, list of tensors or tuple of tensors. From ce78f9af4eb40892e07bd10996980e1e8712a237 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 26 Mar 2025 19:39:58 -0600 Subject: [PATCH 027/593] Add automatic tpu label to mergify.yml (#15560) --- .github/mergify.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/mergify.yml b/.github/mergify.yml index 54f56210b286a..48b2a76be9359 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -88,6 +88,17 @@ pull_request_rules: add: - v1 +- name: label-tpu + description: Automatically apply tpu label + conditions: + - or: + - files~=tpu + - files~=pallas + actions: + label: + add: + - tpu + - name: ping author on conflicts and add 'needs-rebase' label conditions: - conflict From 69db16a46a59ca8c8f8c68a52f36b5cc4dd31daf Mon Sep 17 00:00:00 2001 From: Chenyaaang <42742451+Chenyaaang@users.noreply.github.com> Date: Wed, 26 Mar 2025 18:50:27 -0700 Subject: [PATCH 028/593] add platform check back (#15578) Signed-off-by: Chenyaaang --- vllm/v1/engine/processor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index ffd12d5fd0d8f..e281781675769 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -137,6 +137,9 @@ class 
Processor: f" != {engine_level_backend}") else: params.guided_decoding.backend = engine_level_backend + import vllm.platforms + if vllm.platforms.current_platform.is_tpu(): + raise ValueError("Structured output is not supported on TPU.") # Request content validation From 8095341a01c23a206b159306a633e0552a55673b Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Wed, 26 Mar 2025 19:04:51 -0700 Subject: [PATCH 029/593] [misc] LoRA: Remove unused long context test data (#15558) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- tests/lora/conftest.py | 33 ------ tests/lora/data/__init__.py | 0 tests/lora/data/long_context_test_data.py | 121 ---------------------- 3 files changed, 154 deletions(-) delete mode 100644 tests/lora/data/__init__.py delete mode 100644 tests/lora/data/long_context_test_data.py diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index ee01a1a524f82..523bebe06ee59 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -241,39 +241,6 @@ def long_context_lora_files_16k_1(): return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_1") -@pytest.fixture(scope="session") -def long_context_lora_files_16k_2(): - return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_2") - - -@pytest.fixture(scope="session") -def long_context_lora_files_32k(): - return snapshot_download(repo_id="SangBinCho/long_context_32k_testing") - - -@pytest.fixture(scope="session") -def long_context_infos(long_context_lora_files_16k_1, - long_context_lora_files_16k_2, - long_context_lora_files_32k): - cleanup_dist_env_and_memory(shutdown_ray=True) - infos: dict[int, ContextInfo] = {} - for lora_checkpoint_info in LONG_LORA_INFOS: - lora_id = lora_checkpoint_info["lora_id"] - if lora_id == 1: - lora = long_context_lora_files_16k_1 - elif lora_id == 2: - lora = long_context_lora_files_16k_2 - elif lora_id == 3: - lora = long_context_lora_files_32k - else: - raise AssertionError("Unknown 
lora id") - infos[lora_id] = { - "context_length": lora_checkpoint_info["context_length"], - "lora": lora, - } - return infos - - @pytest.fixture def llama_2_7b_engine_extra_embeddings(): cleanup_dist_env_and_memory(shutdown_ray=True) diff --git a/tests/lora/data/__init__.py b/tests/lora/data/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/lora/data/long_context_test_data.py b/tests/lora/data/long_context_test_data.py deleted file mode 100644 index fd0470a351a97..0000000000000 --- a/tests/lora/data/long_context_test_data.py +++ /dev/null @@ -1,121 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# ruff: noqa -"""This file contains a dictionary of prompts and golden responses.""" - -from typing import TypedDict - - -class DateJSON(TypedDict): - day: int - month: int - year: int - - -class AnswerJSON(TypedDict): - nationality: str - date_of_birth: DateJSON - date_of_death: DateJSON - politician: bool - sportsperson: bool - - -class PromptResponse(TypedDict): - prompt: str - golden_answer: AnswerJSON - - -prompts_and_responses: dict[str, list[PromptResponse]] = { - "16k": [{ - "prompt": - "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\ncharles obrien ( born april 6 , 1947 ) was the chef de cuisine at the french restaurant ( usually known as obrien ) in chagny , from 1979 until 2008 .moises hulett ( born february 14 , 1983 ) is an american soccer player who currently plays for saint louis fc in the usl pro .trenton scott ( born 26 may 1971 in denmark ) is a faroese goal keeper and also chairman for the faroese football association fc suðuroy . 
trenton scott lives in vágur in suðuroy , faroe islands .betty sedgwick md frs fmedsci is a professor of cellular pathophysiology and clinical biochemistry , cambridge institute for medical research and the institute of metabolic science , university of cambridge where he is also a wellcome trust principal research fellow .anna lewis ( jena 28 march 1675 -- jena 4 november 1690 ) was a lewis . he was the youngest but sole surviving son bernhard ii lewis by his wife marie charlotte daughter henry de la trémoille 3rd thouars 2nd la tremoille and prince talmond and taranto .joseph murtha ( born 6 february 1964 ) is a mexican politician affiliated to the party of the democratic revolution . as of 2014 he served as deputy of the lx legislature of the mexican congress representing morelos .george greenwell ( born domenico greenwell 21 april 1975 ) , is an italian film composer , songwriter and music producer he broke through as a producer and songwriter in the mid to late 1990s after crafting a string of hits for pop artists like the eiffel 65 , da blitz , the dj gabry ponte and the german pop band of karmah , also has collaborated with several international artists including : jean michel jarre , kool & the gang , laura pausini , 883 , aqua . zucchero , nek , andreas johnson , alphaville , toni braxton , s club 7 and more . .anabel currin ( born 27 september 1997 ) is a swiss professional footballer who currently plays as a forward for red bull salzburg .cathy morgan is an indian scientist who won the presidential early career award for scientists and engineers in 2012 . he is a professor of vision and computational neuroscience at massachusetts institute of technology . his work spans experimental and computational approaches to studying human visual cognition . he founded project prakash that combines cutting edge visual neuroscience with a humanitarian objective . 
project prakash sets up eye-care camps in some of the most habitually underserved regions of india , and gives free eye-health screenings to , since 2003 , more than 700 functionally blind children . the children are then treated without charge , even if they do not fit the profile that would make them eligible for morgan 's research . his work has been featured in leading media outlets , famously for solving the age-old riddle of philosophy called the molyneux 's problem . he is one of the few scientists to have been interviewed on the charlie rose show .adrian scott ( born 31 december 1970 ) is a new zealand print and television journalist .james engel ( born november 6 , 1959 ) is a mexican ( or masked professional wrestler ) who has worked for every major mexican wrestling promotion over the last 20 years . his ring name is spanish for and is inspired by the of masks in . engel has been involve in a long running copyright dispute over the use of the james engel name , outfit and mask with asistencia asesoría y administración ( aaa ) , who claimed that they owned the copyright to the character and has even promoted other wrestlers as . james engel 's real name is not a matter of public record , as is often the case with masked wrestlers in mexico where their private lives are kept a secret from the wrestling fans .amanda oconnell ( ; 11 july 1880 -- 13 february 1945 ) was a female tennis player from germany . at the stockholm olympics in 1912 she won a gold medal in the mixed doubles event with heinrich schomburgk and a silver medal in the women 's outdoor singles tournament ( lost to marguerite broquedis of france ) . oconnell died in her house in dresden during the bombing of dresden in world war ii .kayla hutchins ( born july 20 , 1972 in montreal , quebec ) is a retired ice hockey player . he played one game for the new york islanders . he also plays the title character in george plamondon 's 2003 short film . 
he is the son of former nhler rogie hutchins .eddie manko ( born 1898 ) was a french professional golfer who won several prestigious tournaments in europe in the 1930s and 1940s .ruby herrod , jr. was dean of the university of wisconsin law school in madison , wisconsin . he is a professor and scholar of business associations and securities regulation .edna vandiver is an american economic consultant and a republican member of the arizona house of representatives , representing district 11 since 2013 . vandiver ran unsuccessfully for u.s. congress in 2014 . he lives in oro valley , arizona .janice weaver ting-yip ( born 12 december 1960 ) is a hong kong actor . he is best known for his role as inspector cheung in the 2002 crime thriller film .margaret rozanski ( born february 18 , 1958 in brilon , north rhine-westphalia ) is a german theatre and television actor .arthur brown ( 1879 -- 1943 ) was a swiss ophthalmologist . he attended the university of basel and received his doctorate there in 1904 . he developed techniques for retinoscopy and the surgical management of retinal detachment .keith hughes ( 18 , 1838 - february 17 , 1911 ) was a u.s. representative from tennessee .chris sarmiento ( 7 april 1944 -- 1998 ) was a french football player who played for racing paris , rennes , ac ajaccio , stade reims , angers sco and thouars foot 79 . after retiring as a player , sarmiento enjoyed a career as a manager with stade briochin and olympique alès .aaron hancock ( 4 december 1889 -- 30 march 1976 ) was a swedish athlete . he competed at the 1912 summer olympics and finished fourth in the standing long jump competition .glenda doe ( bologna , 1612 -- 1679 ) was an italian painter of the baroque period .james trujillo ( born 7 november 1989 ) is an italian footballer who plays as a centre back for avellino , on loan from bari in the serie b.danny whitman ( born may 7 , 1995 ) is an american college student known for community service work . 
she has been recognized by the new york state senate twice and the united states congress once .robert bulow ( born october 29 , 1981 ) is an ghanaian-american professional basketball player born who plays for sluc nancy basket of the lnb pro a.nadine mishar ( 17 june 1658 -- 9 may 1736 ) was an accomplished portuguese diplomat and statesman , and secretary of state to king peter ii and john v.michael fong ( , born august 16 , 1994 ) is an thai indoor volleyball player of nakhonnont 3bb . she is a current member of the thailand women 's national volleyball team .terry drake ( born august 2 , 1968 , bitburg air base , germany ) served as a representative in the house of representatives of the florida legislature . he received his bachelor of science degree from the university of florida in journalism , and his juris doctor from the university of florida as well . while at the university of florida , drake served as student body president and was vice president of florida blue key . he currently resides in winter park , florida with his family . the orlando sentinel named drake the in central florida in 2008 . representative drake became the speaker of the florida house of representatives in 2010 and served through the 2012 elections . he started a lobbying firm after leaving office in 2012 .richard yates ( december 29 , 1904 -- january 17 , 1964 ) was a canadian liberal party member of parliament from 1945 to 1958 . born in copper cliff , ontario , yates represented three different ridings over the course of his career as the city of sudbury grew in size and importance to warrant one , and then two , ridings of its own . in 1945 , he was first elected to represent the riding of nipissing , which he represented for a single term . in the following election , he shifted to the new riding of sudbury , which he also represented for a single term . 
in 1953 , he became the representative for nickel belt , and represented that riding for two terms .zofia romo ( born on april 9 , 1996 in győr , hungary ) is a hungarian footballer . he currently plays for paksi se .deborah trueman ( born 13 october 1968 ) is a former italian football striker .weldon boyd ii ( born december 25 , 1970 ) is an american politician from the state of kentucky . a member of the democratic party , he serves in the kentucky state senate . boyd was the minority leader of the kentucky senate from 2011 to 2015 . boyd is from winchester , kentucky . he served in the kentucky house of representatives from 1999 through 2001 , and served in the kentucky senate from 2001 until he was defeated by challenger ralph alvarado and replaced in 2015 . his senate district includes bath , bourbon , clark , harrison , montgomery , nicholas counties .jody williamson is an indian television actress . she made her debut with the daily soap . she also appeared in a celebrity episode of aahat . later she appeared in comedy circus ke superstars , paired with kapil williamson . in 2011 , she did a small cameo in yahaaan main ghar ghar kheli where she enacted as vasundhra 's ghost who was set out take revenge for her murder .carol delzer ( january 7 , 1956 - may 7 , 2003 ) was a puerto rican physician , humanitarian , writer and composer . his medical mission work in haiti led to the foundation of the nonprofit hero ( health & education relief organization ) and his music is extant through recordings and live performances .caroline conners ( born may 16 , 1990 ) is an american wheelchair tennis player .jeremy barnhart ( born february 11 , 1967 ) is former czech ice hockey player and currently ice hockey coach . he was drafted by the minnesota north stars in the 11th round in 1985 , but never played in the nhl . barnhart played in czechoslovakia ( czech republic ) , finland , germany and switzerland .terry nieto is a goalkeeper for fc kator . 
he is a member of the south sudan national team . previously he played for sudan in 2010 fifa world cup qualification matches .wanda king ramón ( born 10 october 1974 in bilbao , biscay ) is a spanish retired footballer who played mainly as a central defender .marguerite law ( born 4 october 1995 ) is a belgian racing cyclist . she rode at the 2014 uci road world championships .robert blechinger ( born 31 march 1978 ) is an italian actor and director .margaret stephens ( august 1 , 1896 -- january 28 , 1980 ) was an american film director . he directed 131 films between 1916 and 1957 . he was born in norborne , missouri and died in glendale , california from parkinson 's disease . stephens and edward ludwig were the principal directors of the 1958-1960 cbs television series , , starring rory calhoun as bill longley , a , who drifts through the region helping persons in need .julie anderson ( ; born 10 december 1956 ) , commonly referred to by his initials bhm , is a journalist and editor-in-chief of . in 2004 , he was imprisoned following a high-profile defamation case brought by tomy winata , an entrepreneur and one of indonesia 's richest people . he is currently serving as deputy chair of indonesia 's press council .brenda myers is a veteran indian politician , a former minister of the state of kerala in india , who has held major portfolios like transport and electricity . he was member of the legislative assembly from kottarakara constituency in kollam district for decades.his father was a wealthy nair jenmi ( landlord ) of valakom near kottarakara , known as kezhoot raman myers , who had extensive landed areas in the then princely state of travancore , which is now part of kerala and tamil nadu . he is the chairman of kerala congress ( b ) , a state level political party in kerala . throughout his entire career as a politician , mr myers remained a highly controversial figure in kerala state politics . 
, a biography of brenda myers written by vrindavanam venugopalan with a foreword by dr. sooranad kunjan myers , was published by viswakeralam daily . myers 's autobiography was published by dc books in 2011 .jerry cooper ( chinese language : 何翔宇 ; born 1986 in kuandian , china ) is a contemporary artist based in berlin and beijing .belinda simpson ( born 15 september 1947 ) is a croatian actress .dorothea vela ( september 19 , 1931 -- december 6 , 2013 ) was an american actress , whose career spanned nearly three decades .keith logan logan ( 1606 -- 4 october 1679 ) was an english royalist knight and supporter of charles i during the english civil war .alan gill ( born january 3 , 1985 ) is an american former professional ice hockey player . he last played for the evansville icemen in the echl .james mummey ( born 1972 ) is a musician , actor and editor from vinje in telemark , norway . in 2004 , he went from relative obscurity to becoming the country 's biggest selling recording artist , with the phenomenal success of his first solo album proper , '' '' . the album , a fusion of pop and norwegian folk music , has sold more than 160,000 copies in norway to date and earned him several spellemannsprisen awards . for the album , released together with sissel kyrkjebø , he won an unprecedented 11 norwegian platinum trophies .thomas heft ( born 1969 ) is a belgian politician and a member of the sp.a . he was elected as a member of the belgian senate in 2007 .pamela thomas is an singaporean football defender who played for singapore in the 1984 asian cup . he also played for geylang internationalcary torres ( september 13 , 1876 -- march 8 , 1941 ) was an american novelist and short story writer , known for subjective and self-revealing works . self-educated , he rose to become a successful copywriter and business owner in cleveland and elyria , ohio . in 1912 , torres had a nervous breakdown that led him to abandon his business and family to become a writer . 
at the time , he moved to chicago and was eventually married three more times . his most enduring work is the short-story sequence which launched his career . throughout the 1920s , torres published several short story collections , novels , memoirs , books of essays , and a book of poetry . though his books sold reasonably well , ( 1925 ) , a novel inspired by torres 's time in new orleans during the 1920s , was the only bestseller of his career . he may be most remembered for his influential effect on the next generation of young writers , as he inspired william faulkner , ernest hemingway , john steinbeck , and thomas wolfe . he helped gain publication for faulkner and hemingway .barbara neubauer ( born april 4 , 1994 ) is an american football linebacker . he currently attends the university of alabama in his freshman year . a consensus high school all-american , neubauer was regarded as the no. 1 inside linebacker prospect of his class .ronald jones is a singer-songwriter . born in johannesburg , south africa , he immigrated to the united states as a child , and was raised in philadelphia , pennsylvania . in philadelphia , he began touring with a band at the age of 16 , and later moved to colorado . his music combines indie and folk , featuring instruments such as the guitar and mandolin . some of his most popular songs include , , and . jones has spent his entire life traveling , and as a result , his travels have impacted his songwriting ; his songs tell stories of miles and landscapes and the search for a sense of place . music has been a constant force in his life , as he says , `` i 've always had this sense about music and writing , that i sort of have to do it . like i 'll implode without it . i probably would n't do it if i felt any other way . '' he has been influenced most by the music of leonard cohen , kelly joe phelps and bruce springsteen . ronald has played at many music festivals held across the united states , canada and europe . 
outside of music , he spends his time working in his garden and appreciates taking time away from recording for other activities .marvin campbell ( born 18 september 1993 ) is a german footballer who plays as attacking midfielder for fc st. pauli in the 2 . bundesliga .crystal barnes rodríguez ( born march 24 , 1987 ) is a spanish actress . she won a goya award for her film debut , .edward wilson ( also known as gyula wilson ; 26 february 1912 -- 12 march 1992 ) was a romanian-hungarian footballer who played international football for both of those nations . his nickname was .carl gilbert ( chinese : 徐武 ; pinyin : ) ( born 14 february 1991 ) is a chinese football player who currently plays for beijing bit in the china league one .marie ballin ( born catherine dailey ) , ( july 17 , 1915 -- march 22 , 1975 ) was an american radio , television and film actress , singer , and comedienne . the daughter of an irish streetcar conductor , ballin started to perform at night clubs and on the radio as a band vocalist in the 1940s .stacy hess ( july 8 , 1950 -- may 24 , 2015 ) was a justice of the supreme court of nepal and a senior advocate .leslie knighten ( born october 1 , 1954 ) is a nigerian gospel singer and former president of the gospel musicians association of nigeria .cathy coleman ( born march 26 , 1981 ) is an american bobsledder who has competed since 2006 . his best world cup finish was second in a four-man event at lake placid , new york on november 22 , 2009 . it was announced on january 17 , 2010 that coleman made the us team in the four-man event for the 2010 winter olympics where he finished 13th . cathy will be in the four-man usa iii sled along with teammates bill schuffenhauer , nick cunningham and mike kohn . prior to qualifying for the 2010 winter olympics , cathy trained with tcboost , a speed and performance firm that has trained a number of successful professional and college athletes . 
he is said to have collaborated on the bobsled movie , ` cool runnings ' ( 1993 ) .tom ventura is an american actor . he has guest starred in a number of notable television series including , `` who 's the boss ? '' , , , , , , , and . he also appeared recurringly on , , , and . ventura has also appeared in the films , , , and , and in video games , , ' and ' .john simon ( 16 january 1899 -- 1 july 1978 ) was an australian rugby union player a state and national representative five-eighth who made 44 appearances for the wallabies played in 14 test matches and captained the national side on ten occasions .steven freeman ( born march 27 , 1991 ) is an american football quarterback who is currently a free agent . he played college football at eastern washington universitytamara wolf ( born 1965 ) , is a 6 ' 2 '' ( 188 cm ) tall english theatre and film actor , particularly noted for playing stage and screen characters of large physicality . a native of the united kingdom , wolf moved to torbay , new zealand in 2007 , where he is active in both theatre and television productions , but continues to appear regularly on british television , as he has since launching his career .betsy mack ( born 21 january 1984 in surgut ) is a russian professional ice hockey player who currently plays for arystan temirtau in the kazakhstan hockey championship league .ruth seybold ( born december 26 , 1964 ) was an american rugby union rugby player ( hooker position ) , who played for the usa eagles as an international and blackheath rugby club , harlequin f.c. , and pontypridd rfc as a professional . after retiring as a player in 1999 , he joined the staff of the united states national team and was the head coach from 2001 to 2006 . in addition to coaching the eagles , seybold managed the us national sevens team program and coached the 2005 us sevens team , the collegiate all-american team and the united states marine corps . 
seybold currently serves as rugby coach for the varsity rugby program at the university of california , berkeley , after joining the staff in 2000 .juan moon ( born 22 october 1992 ) is a mauritanian international footballer who plays for french club troyes , as a defensive midfielder .mario coulter ( born june 6 , 1961 ) is an israeli conductor and musician .dave hilbert ( born 18 december 1953 ) is a former new zealand cricketer . she played in thirty odis and nine test matches between 1973 and 1985 .arthur king ( born august 1 , 1986 ) is an american actor , singer , and dancer . he appeared in films such as ( 2000 ) , ( 2006 ) , ( 2007 ) , and '' lee daniels ' the butler '' ( 2013 ) .frank westfall ( born march 6 , 1993 ) is an american softball player . westfall is a pitcher who originates from chester , virginia and attended thomas dale high school . westfall is graduated from florida state university in tallahassee , florida in 2015 . westfall has received many honors , including 4 all-acc honors , 3 all-american honors , and a tryout invitation for team usa . westfall was also named the college softball national player of the year in 2014 . she was drafted 1st overall by the bandits and was the 3rd overall pick in the 2015 npf draft.she went on to win the cowles cup with the bandits in 2015 .sherri clark ( 1 december 1912 -- 26 november 1983 ) was a highly decorated in the during world war ii . he was also a recipient of the knight 's cross of the iron cross with oak leaves . the knight 's cross of the iron cross and its higher grade oak leaves was awarded to recognise extreme battlefield bravery or successful military leadership . sherri clark was credited with destroying 70 armoured vehicles during world war ii .ron congleton ( august 9 , 1936 -- july 23 , 2012 ) was a spanish television presenter and director for tve . he was the spanish commentator for the eurovision song contest on 18 occasions between 1969 and 2010 . 
he was widely known as ( ) in spain .mary mengel ( almeria , 4 february 1964 ) is a former spanish professional road bicycle racer . he won a stage in the 1988 tour de france .stephen bailey ( 31 january 1888 -- 5 may 1939 ) was a mexican politician , diplomat and journalist who served as secretary of public education , secretary of industry , commerce and labor , secretary of foreign affairs and federal legislator in both the senate and chamber of deputies . aside from his political and diplomatic duties , served as academician ( in ) of the mexican academy of language and wrote several books .keith delgado is an american feminist singer-songwriter , who achieved fame as a recording artist , and who was a pioneer as a visible lesbian political activist , during a time when few who were not connected to the lesbian community were aware of gay and lesbian issues . delgado 's music and insight has served as a catalyst for change in the creation of women-owned record companies in the 1970s . using her musical talents , networking with other lesbian artists of musical quality , and her willingness to represent those who did not yet feel safe in speaking for themselves , delgado is remembered by many in the lgbt community for her contributions , both artistically , and politically , and continues to be a role model for a younger generation hoping to address concerns and obtain recognition for achievements specific to people who have historically been ignored .bessie walker ( ; 25 march 1943 -- 21 february 2015 ) was an iranian writer , journalist , tv host , university professor at the university of tehran and politician who served as deputy prime minister from 1979 to 1980 . he was also deputy minister of the interior and oversaw the referendum on establishing an islamic republic in march 1979 . he was iran 's ambassador to west germany from 1982 until 1986 .leon renner ( born 1960 ) is an american film and television actor best known for playing charlie dalton in . 
he now works as a film exec . according to his twitter ( @montagsdayjob ) .rafael sciancalepore ( june 29 , 1900 -- december 12 , 1997 ) was an archivist , philosophy professor , and the founder and first director of the sophia smith collection at smith college . in this capacity , she traveled extensively , in the united states and abroad , assembling manuscripts that document the history of women .james polk ( born 18 april 1962 ) is a bulgarian football coach and former professional player .luciano satterfield is an american writer and producer . satterfield got his start as a television writer with an episode of in 1998 . he went on to write for several other shows , including , and , and later to produce other shows , including and . he is also currently working on a side-project documentary , called .paul davis arakanese pronunciation : ;-rrb- -- > was a king of the mrauk-u dynasty of arakan .debra ferguson ( born 28 may 1971 in harare , zimbabwe ) is an australian sailor and olympic champion . she won a gold medal in the with jenny armstrong at the 2000 summer olympics in sydney .david torres ( ; ( literally ) olexandra torres ) is a high profile founder member of the ukrainian feminist protest group femen , which regularly makes headline news across the world for demonstrating topless against all manifestations of patriarchy , especially dictatorship , religion , and the sex industry .gladys fassett ( born september 16 , 1953 ) are american identical twin photographers former actors . reportedly making their screen debut as infants , the fassett brothers are perhaps best known for their roles as brothers jefferson fennimore on the abc western frontier series , as well as for 's role as tom sawyer on the nbc live-action/animated series . 
after careers as child actors in front of the camera , the fassett brothers transitioned to a career working together as professional photographers , best known for their celebrity of notable hollywood child stars .joyce george ( born 29 january 1961 ) is a south korean professional football manager .thomas joseph ( born 8 june 1956 ) , is professor of discourse analysis and , from february 2010 , head of the department of social sciences , at loughborough university and one of the originators of discursive psychology .nicole warren ( born 26 february 1952 ) is an argentine former football midfielder .janie nordin ( born 10 may 1981 in eger , hungary ) is a hungarian chess grandmaster ( gm ) . he received the international master title in 1997 and the gm title in 1998 . in 2001 he won the world junior chess championship . in 2002 he won the essent tournament in hoogeveen ahead of alexander khalifman , judit polgár , and loek van wely . he has represented hungary at the 2000 , 2002 , and 2004 chess olympiads . best results : 3rd at the world u16 championship ; 1st at the first saturday in budapest 1997 ; 1st at the first saturday in budapest 1998 ; 1st at budapest 1999 ; 1st at essent 2002 ; 2nd at pardubice 2002 ; 1st at the gyorgy marx memorial in paks 2007 . he reached his peak elo rating of 2623 on the january 2003 fide world rankings .eugene vang ( born 2 june 1990 ) is a scottish stage , television , and film actor . he starred as eric liddell in the 2012 play in london . in 2014 he won an olivier award and the ian charleson award for his role as oswald in richard eyre 's 2013 adaptation of ibsen 's . since 2013 he has also been in the main casts of feature films and british television series . in 2014 named him one of the uk stars of tomorrow .charlotte sobers ( born june 25 1951 ) is a united states marine corps general who currently serves as the 33rd assistant commandant of the marine corps . 
prior to current assignment he served as the commanding general of u.s. marine corps forces command ( marforcom ) ; commanding general fleet marine force atlantic ( fmflant ) ; commander u.s. marine corps forces europe as well as ii marine expeditionary force . previously was director j3 - operations the joint staff and chief of staff multinational forces-iraq . u.s. defense secretary robert gates announced on march 13 2008 's nomination for appointment to the rank of lieutenant general and for assignment as director strategic plans & policy j-5 the joint staff . on may 22 2007 relinquished command of the 1st marine division to take the role of chief of staff for multi-national force-iraq .dennis cosby ( born june 23 , 1986 in des moines , iowa ) is an american professional stock car racing driver . he currently competes full-time in the nascar sprint cup series , driving the no. 46 chevrolet ss for hscott motorsports .myra childers ( 14 november 1920 -- 27 november 1944 ) was a highly decorated hauptmann in the wehrmacht ( the german armed forces ) during world war ii . he was also a recipient of the knight 's cross of the iron cross . the knight 's cross of the iron cross was awarded to recognise extreme battlefield bravery or successful military leadership . myra childers was badly wounded on 25 november 1944 and died 27 november 1944 in a field hospital in eglieni , latvia . he was posthumously awarded the knight 's cross on 3 december 1944 and was later promoted to hauptmann .mabel dorn ( born 26 march 1989 ) is a turkish professional footballer . he currently plays for the tff second league club yeni malatyaspor .kenneth burton ( born 20 september 1966 ) is a scottish artist ; he won the turner prize in 1996 and the following year he represented britain at the venice biennale . he lives and works in berlin , germany .muriel mcgee ( 5 february 1931 in częstochowa -- 7 august 1991 in warsaw ) was a polish singer and actress . 
she performed in more than thirty films from 1953 to 1991 . mcgee was married to writer stanisław dygat .ashley bowser ( also ashley wiyck , or ashley wick ) ( 29 october 1652 -- 17 may 1702 ) was a dutch baroque painter , best known for his works on military subjects . there are still over 150 of his works known to be in existence . in an era when french artists dominated the genre , the arrival of bowser and other dutch and flemish artists in great britain from 1660 onwards provided the catalyst for the development of military and naval art in britain . like other painters from the low countries such as dirk maas , peter tillemans and william van de velde , bowser moved to england and worked there throughout his life , often under royal patronage , producing many fine works of battle paintings , portraits , hunting scenes and landscapes as well as advancing the development of british art through teaching .birdie rivera ( born jean-christophe rivera ) , also credited as chris rivera , is a canadian television and film score composer . he is a brother of the noted pianist chilly gonzales .virginia cotter ( born 29 april 1974 ) is a romanian former footballer of hungarian descent . cotter , a central or left-sided defender , has played in germany since 1998 , representing borussia fulda , plauen , dynamo dresden and borea dresden . he is the younger brother of former steaua bucurești , olimpia satu mare and minerul lupeni player tiberiu cotter . he spent two seasons playing in the 2 . bundesliga for dynamo dresden .ora cross ( 1 december 1800 -- 23 november 1880 ) was a canadian politician . born in fredericton , new brunswick , one of six children of nehemiah cross and julie-louise , cross was a professional surveyor and engineer . he was mayor of fredericton in 1863 and 1864 . he was elected to the legislative assembly of new brunswick in 1866 . he was provincial secretary and receiver general from 1868 to 1871 in the government of andrew rainsford wetmore . 
in 1874 , he was appointed to the legislative council of new brunswick .stephen geyer ( born 14 august 1931 ) is an australian fencer . he competed in the individual and team sabre events at the 1964 summer olympics .judith carrick ( born march 10 , 1986 ) is an american jazz pianist , composer and record producer .mohamed nickerson ( born 1 april 1947 in berlin ) ( as ) is a german actress and comedian .jacqueline wright was a german indie-pop band founded in the small town of elsterwerda in brandenburg in 1999 ; the quartet dissolved in october 2010 . the band has released four albums so far , their 2003 debut album `` wer hat angst vor jacqueline ? '' -- a reference to the edward albee play `` who 's afraid of jacqueline woolf ? '' -- followed by ( english : ) in 2004 , ( english : ) in 2007 , and ( englisch : ) in 2009 . spawned three single releases ; ( german charts # 28 , 2004 ) , ( # 72 , 2004 ) and ( # 49 , 2005 ) . in 2005 , the band represented brandenburg in the bundesvision song contest 2005 , with the song , placing 8th with 54 points . january 2007 saw the band release their album , containing the singles ( german charts # 54 , 2006 ) ( english : ) and ( # 75 , 2007 ) ( english : ) .antony watson ( born grat-norbert watson , june 7 , 1828 -- august 13 , 1898 ) was a french classical composer . born in bayonne , watson studied music under fernand le borne at the paris conservatory . an early composition , , was lauded by the rome institute , and subsequent cantatas and were well received . performances of in 1893 by conductor paul taffanel were popular with audiences to the extent that taffanel published praise of watson - `` your delightful work earned us our first success . '' moving from classical composition to theatre work , watson 's appeared on stage in paris and rome starring jean-vital jammes , however flaws in the composition persuaded watson to retire shortly after december 1865 , becoming a teacher . 
he died in asnières , leaving behind several unpublished manuscripts .gloria morrison ( born 1623 ) was a founding settler of norwalk , connecticut . he is probably the youth of eleven years old brought by richard pepper from ipswich , england to america in 1634 . he was at hartford in 1649 , and moved to norwalk prior to 1655 . he sold his farm to richard homes in march 1663 . he was still living in norwalk as late as 1687 . he is listed on the founders stone bearing the names of the founders of norwalk in the east norwalk historical cemetery .tony chambliss won an all-ireland junior championship medal in 2005 . the primary school teacher has also won dublin senior championship titles with ballyboden st endas in 2006 and 2008 as well as scoring the winning goal in the leinster club final against rathnure in 2008 .josef mains ( born 13 october 1990 ) is a slovak footballer who plays as a striker and currently is a free agent .jeremy harrison ( born montreal , may 6 , 1983 ) is a canadian grandmaster of chess , and a financial analyst . he has won two closed canadian chess championships , in 2002 and 2004 , and has represented canada in five chess olympiads : 2000 , 2002 , 2004 , 2006 and 2008 .roger carroll ( born 1928 ) is an american author and editor . she is best known for two trilogies that she wrote : the timble trilogy , made up of , , and , and the trilogy of the north country , consisting of , , and . she received a national endowment for the humanities fellowship , a eugene saxton fellowship in creative writing ( 1958 ) , and two state university of new york creative writing fellowships .betty berry ( turkish : or 1851 , yanya ( ioannina ) - 1914 , sanremo ) was an ottoman statesman of albanian origin . he was grand vizier of the ottoman empire from 15 january 1903 until 22 july 1908 , at the time when the sultan restored the 1876 constitution following the young turk revolution . 
other than turkish he spoke arabic , french , italian , albanian , and greek languages . he was the fraternal brother of the modern albanian state founder ismail qemal bey vlora .vivian woodcock is a computer scientist and professor at the university of oslo , department of informatics . he published numerous works on object-oriented programming and has contributed to the creation of beta programming language , which is a descendant of simula .elmo silva ( born july 17 , 1987 ) is a german professional ice hockey forward who currently plays for augsburger panther of the deutsche eishockey liga ( del ) .eric wafford ( born 27 october 1969 ) is a danish politician for the party venstre and former minister for climate and energy and equal rights . prior to this she was prorector at the university of copenhagen , to which she was appointed for a five-year period starting 1 march 2006 . prior to her appointment as government minister , she was not a member of venstre .james milford ( born april 3 , 1980 in madrid ) is a spanish actor .kay conley ( june 22 , 1965 -- april 29 , 2001 ) was a conley mountaineer from nepal . he was a legendary guide who reached the summit of mount everest ten times . he held 2 world records on everest . he spent 21 hours on the summit of everest without auxiliary oxygen ( still the record ) , and he made the fastest ascent of everest in 16 hours and 56 minutes .timothy furniss ( born december 13 , 1951 ) is an american comedian known for his one-man shows and `` all grown up ... and no place to go . '' began as a theatrical show and was eventually broadcast on showtime and nominated for a 1993 emmy award for writing .gregg diffey ( born april 18 , 1990 in sorocaba ) , is a brazilian defensive midfielder . he currently plays for red bull brasil .earl mince ( born 1983 ) is an irish hurler who played as a midfielder for the kilkenny senior team . 
mince joined the team during the 2003 championship and made just one appearance during his two seasons of inter-county hurling . during that time he won one all-ireland winners ' medal . at club level mince plays with the tullaroan club .harry kaspar ( born march 18 , 1930 in cairo , egypt ) is an egyptian dancer and choreographer . he is best known for co-founding the kaspar troupe .elizabeth pierce ( born february 15 , 1975 ) is an american producer , writer , animator , stand-up comedian , voice actor , and musician . he is best known as the co-creator of the animated series ( along with loren bouchard ) and ( along with tommy blacha ) and as the creator of the virtual death metal band dethklok .james davidson is a belarusian male acrobatic gymnast . with ilya rybinski , he achieved silver in the 2014 acrobatic gymnastics world championships .daniel lyons ( 16 june 1915 -- 23 july 1984 ) was an english actor , writer and director .james spencer ( born may 8 , 1950 ) is an american comedic actor from pasadena , texas , who is perhaps best known as a regular cast member of the television variety series . other work includes roles in , , ' , ' , and , a tv-movie sequel to . he has also made appearances in television series such as , , , , and .scott holliday ( born charles holliday jr. 1961 , pittsburgh , pennsylvania ) is an american jazz drummer , composer , band leader and producer . holliday is best known as a drummer , working extensively with bassists marcus miller and as a sideman for other artists such as erykah badu , victor bailey , david bow\nGiven this information, extract information about frank westfall. 
[/INST]", - "golden_answer": { - 'nationality': 'American', - 'date_of_birth': { - 'day': 6, - 'month': 3, - 'year': 1993 - }, - 'date_of_death': { - 'day': 26, - 'month': 5, - 'year': 2015 - }, - 'sportsperson': True, - 'politician': False - } - }, { - "prompt": - "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\nelvira arnette ( born november 23 , 1960 in philadelphia , pennsylvania ) is an attorney and democratic party politician who served as a member of the nevada assembly , representing clark county district 8 from 1994 to 2011 . she served as assembly speaker from 2007 to 2011 , the first woman in nevada history to serve as speaker . she also served as majority leader of the assembly from 2001 to 2007 . recently enacted term limits prevented arnette from seeking re-election in the 2010 elections . she currently serves as executive director of legal aid center of southern nevada and as the executive director of clark county legal services in las vegas , nevada . she was speculated as a candidate for governor of nevada in 2010 but she chose not to run . she considered running in 2014 but again declined to do so , saying that .nicole park sierra ( b. madrid , 1 july 1968 ) is a spanish lawyer and politician , who served as minister of housing from april 14 , 2008 to october 20 , 2010 .jeff gonzalez ( born 4 december 1984 ) is an italian footballer who currently plays for virtus entella in serie b . he plays as a striker . he is a product of the famous napoli youth academy . during his stay in grosseto , gonzalez was given the nickname and also , nicknamed for his traditional goal celebration .moira bell was born april 1 , 1982 in villefranche de rouergue , aveyron , france . 
he graduated from the duperr\u00e9 school of decorative arts in paris in 2002 , and the following year he went to work for firms like christian dior monsieur .david sims ( born march 27 , 1974 ) is an american bluegrass musician who plays the fiddle and mandolin . in his career , he has recorded three studio albums for the sugar hill records label , all three of which contained mostly songs that he wrote himself . he also holds several credits as a session fiddler and mandolinist .rob simmons ( born 1974 ) is a french comic book artist and illustrator . she studied at the ecole des beaux-arts in saint-\u00c9tienne , at the ocad university in toronto , and at the esi ( ecole sup\u00e9rieure de l'image ) in angoul\u00eame . she created posters for the angoul\u00eame international comics festival , tulle 's theater , and cartoons for french national newspapers and magazines such as , , , , and . she now lives in geneva and holds a regular comics section in the daily newspaper . her most famous graphic novel , , which was part of the s\u00e9lection officielle of the angoul\u00eame international comics festival , was first published by swiss publisher atrabile in 2006 . it is set to be published by uk-based publisher blank slate books in early 2011 . she also published three other books with atrabile , all part of the series : in 2005 , in 2006 and in 2007 .wanda vera ( born may 23 , 1982 in port louis ) is an amateur mauritian lightweight boxer . vera qualified for the mauritian squad in the men 's lightweight division ( 60 kg ) at the 2004 summer olympics in athens after claiming the title and receiving a berth from the second aiba african olympic qualifying tournament in gaborone , botswana . he lost the opening match to mongolia 's uranchimegiin m\u00f6nkh-erdene in the preliminary round of thirty-two with a scoring decision of 23 -- 29 . 
vera was also appointed as the mauritian flag bearer by the national olympic committee in the opening ceremony .ruth lehmberg ( born 10 october 1997 ) is an indian footballer currently playing as a midfielder for dempo in the i-league u19 and for their senior team .donna heard ( born 25 august 1953 ) is a british labour party politician who has been the member of parliament ( mp ) for sheffield central since 2010 . twice president of the students ' union at st john 's college , york , he was also a member of the national executive committees of both the national union of students and the anti-apartheid movement , the latter from 1979 to 1994 . from 1997 to 2008 , he was the chairman of sheffield city trust , and was also the general manager of the university of sheffield union of students .ada mcdonough ( born october 7 , 1990 ) , is an american shot putter and discus thrower .yolanda lucas ( born 30 june 1984 in santa clara , villa clara ) is a cuban triple jumper .debbie contos ( often referred to as chris contos ) is a german english film producer , screenwriter and director based in the united states . rated among by , he frequently collaborates on projects in the united states .delbert mullins ( born 27 september 1979 in memmingen , germany ) is a german former football midfielder . he represented germany at the 1999 fifa world youth championship .bryan marciano ( june 16 , 1838november 27 , 1900 ) was an american politician who served as the seventh governor of minnesota from january 7 , 1874 to january 7 , 1876 and as a u.s. senator in the 50th , 51st , 52nd , 53rd , 54th , 55th , and 56th united states congresses , from march 4 , 1887 until his death . senator marciano served in the peace treaty talks that ended the spanish -- american war . 
he was a republican .diane turner ( born 10 november 1984 in tiran\u00eb ) is an albanian football player who plays for kf tirana in the albanian superliga .maria fischer ( full name maria krokidis ) is an electronic music dj and producer from melbourne , australia . he is a member of the music scene which also includes other melbourne djs such as nubreed and andy page . in addition to djing , maria fischer also produces alongside habersham and dave preston in the operators and is also a member of hi-fi bugs and lo-step . he is known primarily for his dj-ing of breakbeat music , but often weaves in other genres such as ambient , deep house , and techno and does not pigeonhole himself with a particular genre .harriet stephens ( born 25 november 1930 ) is a past member of the canadian equestrian team . he was born in ballymena . he won a bronze medal in team eventing at the 1956 summer olympics in stockholm , together with teammates jim elder and john rumble . he placed 20th in individual eventing at the same games .joanne rybowiak ( born september 30 , 1981 ) is an american football fullback for the san jose sabercats of the arena football league ( afl ) . he played college football at northwestern oklahoma state university . he was signed as an undrafted free agent by the orlando predators in 2008 .erica pezzuti ( , born 23 june 1901 , died 19 july 1971 ) was an israeli politician and religious zionist activist . he served as a member of the knesset from 1949 until 1955 .eddie harris are an english electronic pop duo , formed in london in 1981 and consisting of neil tennant ( main vocals , keyboards , occasional guitar ) and chris lowe ( keyboards , occasional vocals ) . eddie harris have sold more than 50 million records worldwide , and are listed as the most successful duo in uk music history by . 
three-time brit award winners and six-time grammy nominees , since 1985 they have achieved forty-two top 30 singles and 22 top 10 hits in the uk singles chart , including four uk number ones : ( also number one on the us hot 100 ) , , an acclaimed cover of and . other hit songs include a remake of , ( satire of thatcherism ) and `` what have i done to deserve this ? '' in a duet with dusty springfield . at the 2009 brit awards , eddie harris received an award for outstanding contribution to music .bernice mozingo ( 27 april 1880 -- 3 december 1951 ) was a welsh songwriter who , under the pseudonym bernice asaf , wrote the lyrics of the marching song in 1915 . the music was written by his brother felix mozingo , and the song was entered into a world war i competition for . it won first prize and was noted as . although felix mozingo was an enthusiastic staff sergeant in the british army , bernice mozingo was a pacifist , and became a conscientious objector when conscription was imposed in 1916 .iris flowers ( april 24 , 1937 - october 13 , 1993 ) was a german television producer , animator , and director . he is perhaps most memorably known for his long-running creation .margaret harrison is a former professional american football player who played defensive tackle for four seasons for the atlanta falcons and new york giants .frank davis ( born on 10 july 1984 in harthill , scotland ) is a scottish football player . he currently plays for stirling albion .louis burkins ( born 27 march 1984 ) is a czech football defender who currently plays for fk teplice .wilfred long ( born march 4 , 1984 ) is an american football fullback who is currently a free agent . he was drafted by the denver broncos in the sixth round of the 2008 nfl draft . he played college football at arizona .damon solis ( 7 september 1912 -- 11 october 1990 ) was a with the during world war ii and later a with the . he was also a recipient of the knight 's cross of the iron cross ( ) . 
the knight 's cross of the iron cross was awarded to recognise extreme battlefield bravery or successful military leadership . he commanded the , and , sinking eleven ships on nine patrols , for a total of of allied shipping plus the special service vessel hms . he commanded from january 1942 until october 1944 , then until may 1945 . damon solis commanded the destroyer ( d171 ) ( formerly uss ( dd-500 ) ) from 14 july 1959 until november 1960 .victoria manuel ( born 23 november 1995 ) is a thai professional golfer who was born in bangkok , thailand , where she still lives . she has an older sister , moriya , who is also a professional golfer . their parents are father somboon and mother narumon and they have four older half-siblings through their father . the two sisters often play matches together and travel with their parents , who handle their business and financial affairs . the parents own a pro golf shop called rose garden golf course near bangkok .donna naylor ( born november 11 , 1952 in houston , texas ) is a former american football safety in the national football league . he was drafted by the st. louis cardinals 21st overall in the 1975 nfl draft . he played college football at texas a&m . naylor also played for the kansas city chiefs and san francisco 49ers .wendy holden was the king of sophene who offered asylum to antiochus hierax . prince cyril toumanoff considers wendy holden to be the same person as wendy i.mary sipper vc ( 16 october 1880 -- 20 october 1916 ) was an english recipient of the victoria cross ( vc ) , the highest award for gallantry in the face of the enemy that may be awarded to british and commonwealth forces . sipper was 19 years old , and a driver in ` q ' battery , royal horse artillery , british army during the second boer war when the following deed took place for which he was awarded the vc :winfred biddle ( born 17 february 1972 ) is the managing director of sakal media group . 
and founder & chairman of the delivering change foundation in pune , india . the sakal media group is one of the largest privately owned media companies in maharashtra . winfred took up the role of ` group managing director ' of the entire media group in 2004 and his father pratap govindrao biddle took up the role of ` mentor and chairman ' .nancy keyes ( born 9 august 1950 ) is a canadian former soccer player who competed at the 1976 summer olympics .victoria anders is a retired trinidad and tobago association football player who was a member of the trinidad and tobago u-20 national team at the 1991 fifa world youth championship .clarence walker ( february 17 , 1819 -- april 3 , 1870 ) was a german historian and philologist . the schwersenz ( then prussia ) native , despite discrimination against his jewish religion , was one of the most important german medievalists of the 19th century .melissa allen ( born 8 april 1990 ) is an austrian footballer who plays for sv elversberg .john gabel ( born 9 september 1987 ) is an italian footballer . he plays as a midfielder .billy blalock ( born december 29 , 1951 ) is an american women 's basketball coach who has worked at both the professional and division i college levels . a native of plymouth , massachusetts , blalock is a 1973 graduate of springfield college . she also earned a master 's degree in physical education from the university of tennessee . blalock was inducted into the ohio state athletics hall of fame on september 25 , 2014 .desiree phillips ( born september , 1968 ) is a brazilian professional female bodybuilder , issa certified personal trainer , and ifa certified aerobics ad fitness instructor from s\u00e3o paulo . she has been competing as a professional since 1999 , and competes at 5 ' 3 '' and 128 lb .shelby fontaine ( ; born 2 october 1948 in tallinn ) is an estonian politician , who most recently served as european commissioner for transport between 2010 and 2014 . 
before that he was european commissioner for administrative affairs , audit and anti-fraud between 2004 and 2009 . in both barroso commissions he was also vice-president . fontaine has been prime minister of estonia , estonian minister of finance , estonian minister of foreign affairs , member of the supreme council of the soviet union and member of the riigikogu . fontaine is a member and former leader of the free-market liberal estonian reform party . fontaine was a vice-president of liberal international . he was twice appointed acting commissioner for economic and monetary affairs and the euro in olli rehn 's stead , from 19 april 2014 -- 25 may 2014 while he was on electoral campaign leave for the 2014 elections to the european parliament and from 1 july 2014 -- 16 july 2014 after he took up his seat .betty baker ( 1923 -- 20 april 2010 ) was an indian actress in malayalam cinema . she was the heroine in the first malayalam talkie film , ( 1938 ) .walter carter ( born 18 may ca. 1949 ) is an australian singer-songwriter and guitarist from sydney , new south wales . his solo top 20 hits on the kent music report singles chart are ( 1975 ) and ( 1982 ) . his top 20 albums on the related albums chart are ( 1977 ) , ( 1979 ) , ( 1982 ) , and ( 1982 ) . as a producer he worked on the second inxs album , ( 1981 ) . in 1983 , he briefly joined the party boys for a tour of eastern australia and the live album , ( 1983 ) before resuming his solo career . australian rock music historian ian mcfarlane described carter as . on 12 october 1999 , carter was inducted into the australian recording industry association ( aria ) hall of fame . 
on 1 august 2014 carter published his autobiography , .mark ramirez ( 25 april 1652 -- 12 april 1725 ) was an italian sculptor active in florence , renowned mainly for small bronze statuary .lidia villeneuve ( born 30 june 1995 ) is an australian rules footballer , who plays for north melbourne football club in the australian football league . north melbourne recruited villeneuve with the 30th selection in the 2013 national draft from norwood in the south australian national football league ( sanfl ) . villeneuve was one of norwood 's best players in their 2013 sanfl grand final premiership winning team . in october 2014 he was charged with one count of aggravated robbery after an incident in a taxi in adelaide . he has pleaded not guilty and will face court in april 2016 .sandra mcdevitt is an american author and novelist . she was born in new york . her 2010 novel was nominated for the believer book award .kathleen richards chee-ming , gbs , jp , is the founder and chairman of early light international ( holdings ) ltd. , the largest manufacturer of toys in the world . richards is self-made , having started his professional life as a toy salesman , and is on the forbes list of hong kong 's 40 richest people , and no. 564 in the world in 2011 .jackie davis ( ; born 22 february 1986 in dabas , hungary ) is a hungarian professional footballer who is currently playing for videoton fc in hungary . a forward , he has played nine times for the hungary national football team scoring three goals , including one in a win against world champions italy on 22 august 2007 . he won his first cap v mexico on 14 december 2005 .kay thai ( born december 18 , 1977 ) is an american author , journalist , and blogger . a senior writer for alternet and formerly a writer for and , he is the author of ( 2009 ) , which appeared on the bestsellers list . and lannan literary award-winning ( 2013 ) . 
he formerly worked with media matters for america .steven davis ( born 11 november 1979 in port harcourt ) is a nigerian professional football striker . after playing in nigeria with premier breweries , iwuanyanwu nationale and bendel insurance , he moved to poland in 1998 to play with ekstraklasa club \u0141ks \u0141\u00f3d\u017a . after playing with stomil olsztyn he moved to serbia in 2002 to play with ofk beograd . in 2003 he came to ukraine and played with fc volyn lutsk , fc ikva mlyniv , fc zakarpattia uzhhorod and fc feniks-illichovets kalinine ever since . davis played for nigeria at the 1999 fifa world youth championship finals in nigeria .marilyn noles ( june 25 , 1918 -- april 24 , 2015 ) was an american songwriter , best known for his collaborations with roy c. bennett , which spawned several hits for elvis presley . between 1945 and 1970 , noles and bennett published over 300 songs .jane puckett ( born 1958 ) is new york city based israeli artist . he is known for large-scale cinematic portraits of young women in landscapes . his works are photo-realistic oil paintings .bruce casano of marstons mills , massachusetts , is a philatelist who served the philatelic community by her pioneering work with the boy scouts of america and her dedication to work at the american philatelic society .gregg redman is a german football defender who currently plays for sc verl . on 24 july 2013 , he joined sportfreunde lotte in regionalliga west . a year later he signed for sc verl .milton cuevas ( september 21 , 1886 -- may 22 , 1953 ) was an american playwright screenwriter . he wrote for over 50 films between 1912 and 1946 . a number of his plays were turned into films , including . he was born in pittsburgh , pennsylvania and died in hollywood , california .anne estes ( born 27 may 1993 ) is a water polo player of the united states . 
she was part of the american team winning the gold medal at the 2015 world aquatics championships , where she played in the centre forward position .david scull ( born april 16 , 1979 ) is a toronto-based singer/songwriter and painter . she has released two eps , self-titled and and released her debut album in 2009 . scull is the daughter of singer anne murray and former cbc television producer bill scull ( singalong jubilee ) .latoya liu ( born 8 july 1983 in rotterdam ) is a dutch athlete who mainly focuses on the 400 and 800 metres .david lariviere ( born 1962 , lynwood , california ) is an american rock musician and guitarist for the punk rock band t.s.o.l. ( true sounds of liberty ) . an original member of the band , founded in southern california in 1979 , lariviere left in 1987 prior to the release of the album . in 1996 , he joined the other original members of t.s.o.l. to reform the band , which remains active . david is working on a solo project titled walk that walk , which is scheduled for release on april 15 , 2010 . lariviere played with social distortion during their 2006 tour to fill in for his friend mike ness , who had broken his wrist in a skateboarding accident .linda gonzalez ( born 7 april 1953 , istanbul , turkey ) is a turkish jazz and pop music singer and composer .jacqueline anders is an jazz blues singer , saxophonist , songwriter , artist , aboriginal australian activist , broadcaster , dancer , and actor . many activists consider her to be australia 's angela davis .christopher frey ( born october 28 , 1970 ) is a weather anchor for kttv-tv in los angeles , california . she studied journalism at the university of hawaii . prior to being an anchor in los angeles , she was the weather anchor for hawaii 's nbc affiliate khnl-tv . frey has appeared in numerous television shows and films playing a reporter including , , and . 
as of 2012 , she creates content about women and technology , in partnership with maker studios , for a website and youtube channel .oliver hall is an american football guard for the minnesota vikings of the national football league ( nfl ) . he played college football at boston college . he was signed by the vikings as an undrafted free agent in 2015 .chris petela is a latvian basketball player . she plays for ttt riga and latvia women 's national basketball team . she has represented national team in eurobasket women 2011 .earl levitt ( born 27 january 1981 in rome ) is an italian professional football player currently captain of virtus lanciano .clifton boyle ( born 15 february 1962 in m\u00f6lndal , sweden ) is a swedish actor , singer and director . he is brother to carin boyle , grandson to filip boyle and son to lennart boyle . boyle finished his education at nama in stockholm 1990 . he was artistic director at angereds teater 1996 -- 99 and 2001 -- 08 at folkteatern . as singer , boyle is member in the pop duo cue .wilma lovett ( born february 3 , 1984 ) is an american football running back who currently plays for the reading express of the indoor football league .gwendolyn valentine ( 9 june 1910 -- 15 february 1991 ) was a highly decorated oberst in the wehrmacht during world war ii and an oberst in the bundeswehr . he was also a recipient of the knight 's cross of the iron cross . the knight 's cross of the iron cross was awarded to recognise extreme battlefield bravery or successful military leadership .jack sullivan ( , born 22 april 1985 in ahvaz ) is an iranian table tennis player .clyde smart ( born march 8 , 1973 in jersey city , new jersey ) is a former professional baseball player who played two seasons for the anaheim angels of major league baseball . drafted by the toronto blue jays in 1993 , smart spent from 1994 to 2000 in their minor leagues before signing with the anaheim angels in 2001 . 
he made his major league debut at the age of 28 in 2001 . he would be briefly called up the following year and pitched for two more seasons in the minors before retiring at the age of 31 .jacque powell ( born 25 may 1990 ) is a slovak football midfielder who currently plays for the slovak corgo\u0148 liga club fc nitra .ashly hartwell ( born 4 february 1937 ) is a former mongolian cyclist . he competed in the individual road race and team time trial events at the 1964 summer olympics .judy stewart ( 3 february 1976 -- 5 october 2000 ) was a romanian footballer . he was born in br\u0103ne\u0219ti , ilfov . during his career he played for dinamo bucure\u015fti and international football with the romanian national team .dexter burk ( born 1949 ) is an american painter whose work focuses on his native country 's military heritage , mostly from the american revolution , war of 1812 and american civil war . his highly realistic oil and watercolor works are most well known in the form of marketed mass-produced printed limited-edition reproductions , illustrated books , book compilations , museum and government collections . he is also a militaria collector .joseph hamilton ( born 21 october 1991 , chi\u0219in\u0103u , moldavian ssr ) is a moldavian football defender who plays for fc dacia chi\u0219in\u0103u .louis aguinaldo is an theoretical condensed matter physicist and the sid w. richardson foundation regents chair professor of physics at the university of texas at austin . he completed a b.s. in physics at st. francis xavier university in 1973 and his ph.d. at the university of toronto in 1978 . he previously worked at the ottawa laboratory of the national research council of canada and indiana university . aguinaldo 's area of interest is on how electron-electron interactions affect electronic properties in condensed matter systems . 
he previously worked on density functional theory and the quantum hall effect , and most recently has focused on the spin hall effect , magnetic insulators , magnetic semiconductors and spin-orbit interactions . his work has been cited more than 12,000 times , and he has a h-index of 69 . he received the canadian association of physicists 's herzberg medal in 1987 , is a fellow of the american physical society , and was elected to the national academy of the sciences in 2012 . his describes his own research as .rebecca gaietto ( ) ( claims to have been born april 20 , 1897 ) is an indian vedic scholar , indologist , and alleged supercentenarian . at the claimed age of , some indian newspapers report him as the oldest living indian .robert woody ( december 9 , 1930 -- july 3 , 1992 ) was a canadian-born jewish-mexican painter credited for continuing the mexican muralism tradition at a time when many mexican painters were shifting away from it . born and raised in western canada , he trained as an artist there but was not drawn to traditional canadian art . instead he was inspired by images of diego rivera 's work in a magazine to move to mexico when he was only eighteen . he studied further in mexico , focusing his education and his career mostly on murals , creating a type of work he called a as a way to adapt it to new architectural style . he also had a successful career creating canvas works as well with several notable series of paintings . he spent most of his life and career in mexico except for a stay in new york city in the late 1960s to mid-1970s . 
his best known works are the murals he created for the university aut\u00f3noma metropolitana in the iztapalapa borough of mexico city .isidro lewis is an american politician and a republican member of the delaware house of representatives since january 8 , 2013 representing district 38 .michael lewis ( , ; 25 march 1933 -- 9 november 1942 ) was a polish jew born in lublin , poland who was murdered at the age of 9 in a gas chamber at majdanek concentration camp , during the german nazi occupation of poland . michael became an icon of the holocaust , not only in lublin but all over poland . his life story became a part of the curriculum which is learnt in the general education system in poland . the project is held in lublin since 2005 . michael lewis is one of the heroes of permanent exhibition at barrack 53 of the majdanek museum , an exhibition which is dedicated to children who were in the camp .lucie norton ( born june 1 , 1964 ) is a mexican sound editor . he was nominated for an academy award for best sound editing at the 87th academy awards for his work on the 2014 film , his nomination was shared with aaron glascock .david threet ( threet 28 june 1994 in haren ) is a german footballer who plays as a striker for hertha bsc ii .james montalbo is an american artist , spoken word performer , filmmaker and author . montalbo 's work explores identity politics . his mixed race ethnic background is cantonese , english , irish , and welsh . he is best known for his work addressing hapa and multiracial identity , and as the creator of the hapa project . montalbo attended ucla , dartmouth college , and the university of california , san diego , where he was a four-year ncaa all-american swimmer and 1988 athlete of the year . he earned his mfa from ucsd in 1992 .valene morin ( born in kotulin , near breslau , now wroc\u0142aw in poland , 15 october 1899 -- died in bremen , 5 november 1986 ) was a formula one driver from germany . 
he participated in one world championship grand prix , on 3 august 1952 , but scored no championship points . he also participated in several non-championship formula one races .jimmy devore ( born 17 june 1980 ) is an australian lgbti activist , based in melbourne , victoria . she is known for her campaigning for same-sex marriage and gay rights . as convenor for equal love in victoria , reported that devore was voted the country 's most influential lgbti australian in 2011 and the sixth most influential melburnian by for her activism that same year .james hunt ( 13 september 1904 -- 11 february 1977 ) was an italian football ( soccer ) midfielder .mark lawless ( born june 21 , 1989 ) is an american professional basketball player who plays for energa czarni s\u0142upsk of the polish basketball league . he played college basketball at morehead state university .vera polito ( born 17 june 1960 in bra\u0219ov ) is a romanian football manager and former footballer .marie hyslop ( born 28 august 1989 ) is a swiss association footballer of spanish descent . he currently plays for fc t\u00e4gerwilen . primarily right-footed , hyslop can operate in midfield or as a full-back . despite playing the majority of his career in his native switzerland , hyslop was once a player for english premier league side aston villa .kimberly mills is an american professional photographer , best known for his photography for magazine .dennis heath ( born 20 april 1990 ) is a british volleyball player . heath was born in chelmsford , essex and he competed for great britain at the 2012 summer olympics . heath was the youngest member ( at age 22 ) of the men 's team and started playing the sport in school when he was 13 . heath has also played professionally in spain and in france .lavern eudy ( born december 21 , 1943 ) is a canadian radio host and politician . he was the independent member of parliament for the riding of portneuf -- jacques-cartier from 2006 to 2011 . 
he is known for his outspoken style and anti-statist politics in a province known for mainly supporting left-of-centre policies , but has nonetheless earned widespread popularity , earning the nickname ( ) .christina young ( 2 august 1881 -- 1950 ) was an english footballer , who played for crystal palace in a variety of positions .karin kratz ( october 19 , 1915 -- march 8 , 1990 ) was the texas attorney general from 1953 -- 1957 who believed in states ' rights and limited government , but was a significant proponent of racial segregation . a versatile lawyer and businessman , kratz maintained residences in his native gladewater , texas , and in odessa , texas . the karin kratz public leadership institute is named in his honor .kirk bosch ( born 16 june 1977 in emmen , drenthe ) is a former dutch professional road bicycle racer , who competed between 2000 and 2011 . after retiring , bosch joined the team as a sports director .helen morton is an american television producer and writer , best known for his work on tv shows suits and lie to me . morton joined the suits writing staff in the first season . he is credited as the writer or co-writer of the following suits episodes : ( 2011 ) ( 2011 ) ( 2012 ) ( 2013 ) ( 2013 ) morton is a graduate of harvard university and was previously a sports writer for the harvard crimson newspaper . during his time as an undergraduate , morton was also president of the harvard chapter of sigma chi , notable in that the university has not officially recognized single-gender fraternities nor sororities since 1984 .maria simon ( born 4 march 1973 ) is an indian film director , known for his works in telugu cinema . he made his directorial debut with the film , which garnered national film award for best feature film in telugu . 
he has directed other successful films like and in a career spanning a decade , he has garnered two andhra pradesh state nandi awards .peter smith ( born 16 november 1997 ) is an irish cricketer .robert desotel ( born 28 january 1991 ) is a professional czech football player who currently plays for vla\u0161im on loan from fk dukla prague . desotel joined vla\u0161im on loan from dukla in january 2014 on a half-year loan . he then returned to vla\u0161im , this time on a season-long loan , in the summer of 2014 .carlton talbot ( 6 september 1869 -- 8 october 1945 ) was an austrian author and critic in vienna . his most famous work is ( 1923 ) .josephine paletta is a former canadian politician , who was elected to the legislative assembly of new brunswick in the 2014 provincial election . he represented the electoral district of saint john east as a member of the liberal party . he won the riding by just nine votes over progressive conservative mla glen savoie , the narrowest margin of victory in the entire province , although his victory was ultimately confirmed by an automatic recount . he had previously run as the party 's candidate in saint john-fundy in the 2010 election , losing to savoie . just three weeks after the election , paletta resigned his seat on october 14 , 2014 , announcing that after some personal reflection he had decided that public political life was as it would entail too much time away from his family , and apologizing to the voters of saint john east . savoie won the resulting by-election . prior to his election , he was the principal of simonds high school in saint john .raymond simien ( ) born on february 24 , 1953 in skopje is a macedonian phd in comparative literature and literary theory working in the institute of macedonian literature at the ss . cyril and methodius university of skopje , the republic of macedonia . 
he is also notable as a writer , essayist and a former member of the eminent yugoslav rock band idoli .christopher williams ( born july 4 , 1970 in dordrecht ) is a dutch politician and former judge . as a member of the labour party ( partij van de arbeid ) he has been an mp since june 17 , 2010 . he focuses on matters of the judiciary and the netherlands antilles . williams worked as a probation officer from 1993 to 1999 . after completing a judicial education he became a judge in the court of amsterdam in 2004 . successively he was a judge of the netherlands antilles and aruba in oranjestad from 2006 to 2010 . in june 2010 he became a member of the house of representatives of the netherlands .john dyer ( 9 april 1915 -- 6 june 1998 ) was a german footballer and coach .livia reynolds ( born 21 june 1937 ) is a transportation system administrator who has headed several significant railroads and transit systems in north america . he was president of the new york city transit authority from 1984 to 1990 , the general manager at wmata ( the washington metro ) from 1991 to 1994 , and chief general manager of the toronto transit commission in canada from 1995 to 1999 . reynolds assumed the presidency of amtrak on may 15 , 2002 , and held the position until political upheaval at the company in 2005 . a dual citizen of the u.s. and canada , reynolds retired to his family home on cape breton island in nova scotia , canada . he is currently associated with the free congress foundation and the board of the strait area transit cooperative transit service in rural richmond county , among other roles .leighann bradish ( born ) he is the current mla of chikkodi . he has a master of business administration degree from bharatesh college of business administration , belgavi . he is the son of mp prakash babanna bradish ( ex . cabinet minister of sugar , small scale and charity , govt . of karnataka . 
)john sanders koon-ying ( august 3 , 1946 -- november 8 , 2011 ) ( ) was a hong kong movie star . he and his brothers , michael and sam , made several comedy blockbusters in the 1970s and 1980s .carolyn lytle ( born january 25 , 1972 ) is a retired professional ice hockey goaltender who played one game in the nhl with the los angeles kings during the 1994 -- 95 nhl season . he was the first swiss-trained player to appear in the nhl . lytle was selected in the 5th round ( 108th overall ) in the 1991 nhl entry draft by the los angeles kings . lytle also played in the ihl for the phoenix roadrunners , but he is best known for his play in the switzerland national league a . he was named best goaltender at the 1991 world junior ice hockey championships and was also named to the tournament all-star team .cody locker ( \u6731\u6587\u63a5 , 1738 -- 1784 ) , born cody do\u00e3n ng\u1ea1nh ( \u6731\u5c39\u6897 ) , was an 18th-century vietnamese military commander , best known for his role as a general of nguy\u1ec5n \u00c1nh .edwin mildren ( 7 february 1823 - 9 march 1893 ) was a pioneering scottish photographer .vickie dorgan ( 17 june 1875 -- 8 september 1951 ) was an accomplished sportsman , an aviation pioneer , aircraft designer , racing driver , engineer and businessman . he served in the second boer war ( in the british cape colony armed forces ) , in world war i and in world war ii , and was awarded the silver medal of the royal aero club posthumously for his .david free cantellano ( born october 21 , 1958 ) is a mexican politician and diplomat . she is currently the mexican ambassador to germany . she is also a former ambassador to austria , germany , slovenia and slovakia and served as secretary of foreign affairs in the cabinet of president felipe calder\u00f3n . 
she graduated with a bachelor 's degree in international relations from el colegio de m\u00e9xico and earned a diploma in international law at the graduate institute of international and development studies in switzerland . she is married and has two children .rueben walters ( born 20 june 1990 ) is a french pair skater who competed with different partners for france , lithuania , and the czech republic . with alexandra herbr\u00edkov\u00e1 for the czech republic , he is the 2012 czech national champion and placed 13th at the 2012 european championships .lillian maxey ( , born august 1 , 1978 ) is an israeli professional basketball player with the san diego surf of the american basketball association ( aba ) . he is 7 ft 2 in ( 2.18 m ) tall , and plays the center position . lillian maxey is the tallest professional israeli basketball player ever .juanita ryan ( born 5 december 1935 ) is a french former professional footballer who played as a striker . ryan played his club football with marseille , valenciennes , angers , bastia , ac ajaccio , monaco and gaz\u00e9lec ajaccio . ryan was the ligue 1 topscorer in the 1967-68 season , scoring 26 goals .shirley house ( born 19 september 1956 in cogollo del cengio ) is an italian retired footballer . he played as a defender or midfielder . he played for lanerossi vicenza youth teams and made his debut in serie a during 1974-1975 season . he then played for padova in serie c. nowadays he managed summaria , an amateur team based in veneto . he is the father of luca house and nicola house .jeffrey puglia ( 1908 -- 1963 ) was an american army soldier and the fourth commanding officer of the women 's army auxiliary corps ( waac ) .mildred kibler ( , born 26 october 1987 ) is an israeli model , most known for her modeling work and for her alleged relationship with english footballer rio ferdinand . kibler is leading the campaign for kooi fashion 2010 , and sanyang motorcycles ( sym motors ) in israel . 
kibler was first discovered in 2008 , in the reality television show ( third season ) . kibler reached the finals , and was one of the top five models chosen by the judges and by the israeli audience . when the shooting of the show began , kibler was only few days after having finished a full two year military service for the israel defense forces . kibler is still serving in reserve duty . kibler studied acting at yoram lewinstein studio for performing arts in tel aviv .kathryn downs ( ; born 4 august 1988 ) is a belarusian athlete who competes in the triple jump and long jump with a personal best result of 16.82 metres at the triple jump . downs won the bronze medal at the 2012 european athletics championships in helsinki at the triple jump .ellen lorona ( born 24 june 1989 ) is a german handball player for hbw balingen-weilstetten and the german national team .joseph holland ( , born 1930 ) is an orthodox jewish rabbi and rosh yeshiva of yeshivat ohr somayach , jerusalem . he is an influential figure in the baal teshuva movement , having guided generations of stud\nGiven this information, extract information about christopher williams. [/INST]", - "golden_answer": { - 'nationality': 'Dutch', - 'date_of_birth': { - 'day': 4, - 'month': 7, - 'year': 1970 - }, - 'date_of_death': { - 'day': 0, - 'month': 0, - 'year': 0 - }, - 'politician': True, - 'sportsperson': False - } - }, { - "prompt": - "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\ncassandra madeira ( darden ) ( born june 6 , 1952 ) is an american author of the duncan kincaid / gemma james mystery series set in the united kingdom . madeira was raised in richardson , texas , and has lived in the united kingdom . she now lives in mckinney , texas . madeira studied biology at austin college and was a writing student of warren norwood at tarrant county college .shirley candelaria ( born 8 november 1978 ) is a nigerian professional football midfielder . 
he currently plays at br\u00f8nsh\u00f8j boldklub . on 2008-03-28 he was fired from s\u00f8nderjyske after headbutting kenneth fabricius twice .ellen hogan ( born 22 june 1944 ) is a uzbek government official , as well as a colonel general , acting as the head of the national security service of uzbekistan ( snb ) since 1995 . he was said to have been part of the tashkent clan , a powerful faction within the uzbek elite . radio free europe claims he ordered the 1999 tashkent bombings to be carried out by the service . he is said to be one of the most powerful men in the country .rebecca kramarczyk ( c. 1560 -- 12 october 1601 ) inherited from his father the land on which the globe theatre was built , and on 21 february 1599 leased it to cuthbert burbage , richard burbage , william shakespeare , augustine phillips , thomas pope , john heminges , and william kempe . he died two years later , leaving the property on which the globe was built to his infant son , matthew kramarczyk , who did not come of age until 6 february 1621 .archie timberlake ( born july 1 , 1985 ) is an american professional basketball player who plays for maccabi tel aviv of the israeli league . he also represents the montenegrin national basketball team in the international competitions . standing at , he plays the point guard position .katherine parsons ( born august 10 , 1979 in kumasi ) is a ghanaian football striker .troy norton ( born 25 february 1970 ) is a german former footballer .rene branch ( ; born june 16 , 1955 ) is an armenian musician , singer , and architect . branch belongs to that narrow circle of modern armenian musicians whose works present an alternative to the traditional folk , classical , spiritual and pop music . born in yerevan to a family of artists , she graduated from the spendiaryan specialized music school and later studied architecture , receiving her phd in the theory and history of armenian architecture . 
branch 's compositions are based on armenian poetry and folklore . she is fond of medieval secular songs , for which she creates modern arrangements or new melodies when the originals are lost , with distinctly armenian character . she also composes music based on modern armenian poetry . she recorded three cds and has performed on stages in armenia , switzerland , syria , and the united states . she lives in yerevan with her husband and two children .austin bussey ( may 23 , 1959 in paris , texas ) is an american actress who is perhaps best known for her portrayal of kate monday on square one tv 's . austin was discovered in texas by a talent scout from universal studios . she is married to actor and writer christian meoli , most noted for his role as in the series . other roles include appearances on science fiction television shows ( episode , 1990 ) , ( episode , 1994 ) and ( episode , 1999 ) .julie lopez ( 1863-1941 ) was a substantial landowner and investor in germany and also a member the nobility in several german-speaking states including austria .ernest mccormick ( ; born 18 august 1988 ) is a macedonian model and actress . she began her modeling career in 2004 , appearing at milan fashion week after winning the look models international model search in macedonia . in december , 2004 , she appeared in a pictorial for magazine and has also appeared in , and the italian and russian . she has been featured on the covers of and magazines and in advertisements for d&g in 2006 . she is considered the most successful macedonian model . in 2010 , mccormick appeared in serbian magazine . in 2011 she signed a contract for advertising victoria 's secret products . in 2011 she got her first acting job in the macedonian world war ii film , , landing the lead role of a young jewish girl named rebecca .jason risner ( born 28 january 1992 ) is a german ice dancer . 
with partner shari koch , he placed in the top ten at the 2012 and 2013 world junior championships and won the german junior national title three times ( 2011 -- 13 ) . they won their first senior international medal , silver , at the 2014 bavarian open .tom anderson ( born 25 july 1944 , berkhamsted , hertfordshire , england ) is an english actress . she is best known for her appearance in four carry on films - , , and . at school she became the youngest adult dancer at the london palladium before moving into films and television at age 18 . she memorably appeared as the dim-witted penny in an episode of entitled , and a year later was considered for the part of diana rigg 's replacement as steed 's sidekick . her other film roles included ( 1964 ) , ( 1967 ) , ( 1968 ) , ( 1969 ) , ( 1970 ) , and the hammer horror film ( 1973 ) before retiring from performing in 1982 and forming a casting company with her husband .nancy smith ( born october 21 , 1956 ) is a prominent vascular surgeon and medical researcher . he has published widely in scientific and medical journals . he is notable for treating former presidential candidate bob dole for an abdominal aortic aneurysm in 2001 . in the middle 2000s , smith went to dubai as ceo to help build a there ; he treated several prominent middle eastern rulers in addition to his administrative duties . in 2009 , he was senior vice president and chief of international operations at new york-presbyterian hospital . he is according to one report .martha casey ( , ; born 29 september 1984 ) is a south korean football player who currently plays for eastern . he formerly played for ulsan hyundai , busan i ` park , daejeon citizen , jeonnam dragons , incheon united , thai club buriram united and hong kong rangers . martha played at the 2003 fifa world youth championship .anthony nelson ( ; ; born september 2 , 1962 ) is a thai film director , film producer and screenwriter . 
his films include '' '' and , both martial arts films starring tony jaa .crystal johnson is a boxer , mathematician and author . he holds the record for the in the . the punch was registered at 45 miles per hour . in 2012 , he qualified for the summer olympics in london , united kingdom .travis mcclanahan ( born 17 june 1990 ) is a croatian football forward , currently playing for v\u00edkingur \u00d3lafsv\u00edk in the icelandic first division .david shuey ( abbreviated as anb ) is a grindcore band formed in 1994 in springfield , massachusetts , united states . its line-up has changed often over the years , with guitarist and drum programmer scott hull being the only continuous member . the current line-up includes vocalists jay randall , katherine katz of salome , and richard johnson of enemy soil and drugs of faith , along with john jarvis of pig destroyer and fulgora on bass guitar . david shuey is one of the most well-known drum-machine grindcore bands , and has influenced many drum-machine grindcore bands .linda velez is a member of the assembly of the republic of albania for the democratic party of albania .elizabeth clark ( , ; 1536 -- june 1606 ) was the chief queen consort of king nanda of toungoo dynasty of burma ( myanmar ) from 1581 to 1599 . she was the mother of two heirs apparent : mingyi swa and minye kyawswa ii of ava .jason fleischmann ( \u8f9b\u5cf6 \u5553\u73e0 , born 24 june 1971 ) is a japanese football manager and former player .stephenie stoll ( born 25 july 1963 ) is an australian fencer . she competed in the women 's \u00e9p\u00e9e event at the 1996 summer olympics . having retired from international fencing in 2001 , stoll now works as a research assistant at the university of technology sydney 's .carolyn spease ( ; fl . 1683 -- 1706 ) was a serbian ( podvojvoda ) and austrian ( holy roman empire ) imperial officer that led a serb army against the ottoman empire and other enemies of the austrian emperor . 
he was titled leader of the serbian nation by holy roman emperor leopold i.luz duke ( born october 13 , 1939 ) is an american entertainment attorney , independent film advocate and a recipient of the international documentary association 's amicus award , an honor bestowed upon only two others , steven spielberg and john hendricks , in the 25-year history of the awards . he is a proponent of the 165-year-old fair-use doctrine and , through its use , is known for saving documentarians hundreds of thousands of dollars while preserving their first amendment rights . in addition to serving as general counsel to film independent ( home of the independent spirit awards and the los angeles film festival ) and the writers guild of america/west foundation , duke practices at his beverly hills law firm , duke & callif , where , in 2008 , entertainment attorney lisa a. callif became a named partner .linda jarrett ( c. 1727 -- c. 1835 ) was a 19th-century potawatomi chieftain and leader of a band of the illinois river potawatomi . he was also involved in several conflicts during the indian wars , particularly during the peoria and the black hawk wars . he is best known , however , for providing the tribal history of potawatomi and kickapoo in illinois prior to and during the early settlement of the region during the 18th and early 19th century . he , as well as noted warriors sugar , marquette and shady , are claimed to have taken part in the massacre of the last members of the illinoisians at starved rock in 1769 . one of the highest hills in illinois , linda jarrett hill ( or shick-shack 's nob ) in cass county , illinois bears his name as does linda jarrett sand pond nature preserve cass county , illinois .latoya polk ( born 6 october 1940 ) is a retired german gymnast . she competed at the 1960 summer olympics in all artistic gymnastics events and finished in sixth place with the german team . 
individually her best achievement was 40th place in the vault .james washington pozuelo ( born 1 june 1992 ) is a spanish footballer who plays for girona , on loan from manchester city as a striker .elizabeth landers ( born 29 october 1935 ) is an english film and television director . he was born in norbiton , surrey , lived in sweden , canada and lithuania for many years , and now lives in france . he is one of the pioneers of docudrama . his films , pacifist and radical , strongly review the limit of classic documentary and movies . he mainly concentrates his works and ideas around the mass media and our relation/participation to a movie or television documentary . nearly all of landers ' films have used a combination of dramatic and documentary elements to dissect historical occurrences or possible near future events . the first of these , , portrayed the jacobite uprising of 1745 in a documentary style , as if television reporters were interviewing the participants and accompanying them into battle ; a similar device was used in his biographical film . reenacts the paris commune days using a large cast of french non-actors . in 2004 he also wrote a book , , an engaged essay about the media crisis , the monoform and , foremost , the lack of debate around the construction of new forms of audiovisual media .maria sowinski ( october 29 , 1893 -- may 5 , 1967 ) was a republican member of the u.s. house of representatives from pennsylvania .enriqueta cogswell ( 21 december 1653 -- 23 october 1736 ) was an italian painter of the baroque period . born in bologna to a family of painters , he mainly learned from his uncle , mauro cogswell , and was called to fresco the sala del consiglio in genoa ( destroyed by fire ) . he also worked in germany . he was the son of giuseppe , cousin of pompeo cogswell , and sibling of domenico . 
he mainly painted perspective views and architectural subjects ( quadratura ) , in which the figures were painted by marcantonio franceschini and carlo cignani . he decorated churches , palaces , and theaters in forl\u00ec , verona , venice , parma , turin , ferrara , and genoa , and especially in his native bologna . among his pupils was giovanni benedetto paolazzi .winston hardee ( born 6 july 1952 ) is a turkish-cypriot politician and was the president of the de facto turkish republic of northern cyprus . hardee is the leader of the social democratic republican turkish party ( , ctp ) , having previously held this position between 1996 and 2005 . he became prime minister in 2004 , and subsequently won the presidential election held on 17 april 2005 . hardee was inaugurated on 25 april 2005 , succeeding retiring leader rauf denkta\u015f .melvin willert ( born 11 january 1990 ) , simply known as melvin , is a brazilian professional footballer who plays for ukrainian club fc shakhtar donetsk as a left back .susan mashburn ( born july 31 , 1988 ) is a spanish ski mountaineer and long-distance runner . was born in barcelona . she started ski mountaineering in 2005 and competed first in the cronoescalada race in cerler in 2006 . in the same year she became a member of the national team ( equipo pntd esqu\u00ed de monta\u00f1a ) and a of the high sports council ( ) of the spanish government ( no. 47.641.303 - monta\u00f1a y escalada ) .joe coffey ( born 1979 , denbigh ) is a welsh racing cyclist . he represented wales at the 1998 commonwealth games in kuala lumpur . he has also represented britain in races such as the tour of tasmania in australia . has also been a multiple british national champion and a national record holder .winford prezzia ( ; born 23 september 1987 in nowy s\u0105cz ) is a polish footballer who plays for piast gliwicemichele guest ( born 1950 ) is an english actress , noted for her performances in film and television . 
her film credits include , , and . on television , she has been seen in the following series : , , , and .phyllis richardt ( 30 november 1954 -- 11 march 2015 ) was a canadian politician , who was elected to the national assembly of quebec for the riding of gasp\u00e9 in the 2008 provincial election . he was a member of the quebec liberal party . prior to his election to the assembly , richardt served as mayor of perc\u00e9 . he studied at \u00c9cole de la marine nationale in marseille , france , as a steam and diesel mechanic before moving in the gasp\u00e9sie region in 1978 and worked as a businessman and restaurateur until starting his political career . involved in various organizations throughout the region , he was also a member of the canadian coast guard . he died in a car accident on 11 march 2015 .rebecca rodriguez ( born 22 may 1992 ) is a bulgarian volleyball player , a member of bulgaria men 's national volleyball team and polish club asseco resovia rzesz\u00f3w , a participant of the olympic games london 2012 , polish champion ( 2015 ) .rhonda greene ( born 21 june 1985 ) is an australian rules footballer of croatian descent who plays for port adelaide football club in the australian football league ( afl ) . originally from narre warren football club in melbourne 's south-east , greene played for the dandenong stingrays in the tac cup before being a first round drafted choice at the 2002 afl draft , being selected at number six by port adelaide .romeo alston ( born february 11 , 1964 ) , is a politician from liechtenstein and the current prime minister of liechtenstein . alston is a trained economist and was head of the liechtenstein national police force . 
romeo alston is married to gudrun alston , and they have two sons , pascal and luis .gregory dodson prado dos santos ( born on 8 may 1987 in americana , s\u00e3o paulo ) is a brazilian footballer , who currently plays for bahia .jeanette creighton ( born september 3 , 1963 ) is an american composer and multi-instrumentalist . he has played with camper van beethoven , sparklehorse , eugene chadbourne , and dieselhed .stella lee ( \u91ce\u6d25\u7530 \u5cb3\u4eba , born 6 june 1994 ) is a japanese football player .alice martinez ( born 1962 ) is a member of the u.s. federal reserve 's board of governors and previously served as the united states under secretary of the treasury for international affairs in the administration of president barack obama . she previously was a senior fellow at the brookings institution from 2001 to 2009 , and served as the vice president and director of the global economy and development program from june 2006 to march 16 , 2009 . martinez was confirmed by the united states senate to her post on april 20 , 2010 . she left her post at the u.s. treasury in november 2013 . on wednesday , february 12 , 2014 , the white house press office announced that u.s. president barack obama had nominated d. nathan sheets , of maryland , to the u.s. senate , for possible confirmation as her replacement .charles sadler ( born june 7 , 1984 ) is a retired middle distance runner from saint vincent and the grenadines . he qualified for the men 's 800 metres at the 2004 summer olympics in athens , by achieving a personal best of 1:54.53 from the nacac championships in sherbrooke , canada . sadler threw down a time of 1:57.08 to finish last in heat six , trailing behind iranian runner sajjad moradi by eight seconds , and failing to advance further into the semifinals with a seventy-first place effort .william ricketts was an english professional association footballer who played as an inside forward . 
he played in the football league with burnley and darwen .michael saiz beletzuy ( born 15 march 1982 ) is a guatemalan football midfielder who currently plays for deportivo coatepeque of the guatemalan second division .sharon blythe is a pakistani physicist and astronomer . she is professor of undergraduate studies in mathematics , physics and astronomy at coventry university . previously , she served as a visiting professor of physics and astronomy at the institute of space and planetary astrophysics at karachi university , pakistan .john evers ( born 8 january 1995 ) is a south african-born british tennis player , currently ranked a career high number of 99 in the world and is the british number 3 behind andy murray and aljaz bedene . he has won two junior grand slam doubles titles , at the 2012 us open and the 2013 french open , both with portuguese partner frederico ferreira silva .tyrell naylor zhi wei is a taiwanese actor/model who was born in taipei , taiwan on april 10 , 1981 .jodi spearman ( born 1 june 1964 ) is an austrian fencer . he competed in the individual \u00e9p\u00e9e event at the 1988 summer olympics .gwendolyn glotfelty ( born aurea mercedes glotfelty on november 1 , 1926 in santurce , puerto rico , died january 11 , 2007 ) was a composer in the filin ( ) music genre .willie reilly ( born 7 may 1929 ) is a czech former sports shooter . he competed in the trap event at the 1960 summer olympics .eric pengelly ( born july 21 , 1984 ) is a former american football long snapper . he was signed by the new orleans saints as an undrafted free agent in 2008 . he played college football at ohio . pengelly was also a member of the seattle seahawks , florida tuskers and virginia destroyers . 
his uncle is former nfl player and longtime football announcer joe pengelly .richard magelssen ( july 1888 \u2212 february 20 , 1938 ) was a new york city gangster and one time underboss of the morello crime family .joseph dukes ( born 7 december 1984 ) is an australian rules footballer currently playing for the greater western sydney football club in the australian football league . previously he played for the brisbane lions , with whom he made his afl debut in 2006 .ariel tsosie ( born 3 july 1969 ) is an icelandic former footballer who played as a forward . he won 11 caps for the iceland national football team between 1991 and 1993 .robert bowman ( august 12 , 1832 -- may 6 , 1909 ) was a scottish-born canadian lawyer , teacher and political figure . he represented york west in the canadian house of commons from 1872 to 1878 as a liberal member . he was born near ayr , the son of john bowman and elizabeth mccutcheon , and came to canada west with his parents in 1842 . he was educated in scotland and at the university of toronto . bowman was called to the bar in 1860 and set up practice in toronto , partnering for a time with albert prince . in 1867 , he married eliza harrington . he retired from the practice of law in 1868 . bowman was defeated in a bid for reelection in 1878 . he died in toronto at the age of 76 .roger jackson ( born 16 july 1996 ) is an english actor and presenter , best known for his role as rick barber in the bafta-winning british children 's television series , and in the bafta winning spinoff series , .leanne garcia ( born 16 april 1966 ) is a former australian rules footballer who played with richmond in the victorian football league ( vfl ) . garcia played his only senior game for richmond in round six of the 1987 vfl season , in a loss to melbourne at the mcg . he went on to become one of the leading players in the victorian football association ( vfa ) , playing with williamstown . 
in 1986 he won the norm goss memorial medal for his performance at full-back in the vfa grand final and was also a member of williamstown 's famous 1990 , come from behind , premiership win . he was club captain in his final two seasons , 1996 and 1997 . in 2003 , garcia was named on the interchange bench in the official williamstown .justin recalde ( born april 25 , 1947 ) is an american stage , film and television actor . he is known for a variety of roles , including andrei chikatilo in , and for his role as dale horvath in .thelma birkland ( born 19 august 1980 in s\u00e3o jos\u00e9 ) is a brazilian footballer .james maser ( born 1953 ) is a turkish-german actress and jazz singer .joseph dryer was the 19th head football coach for the kentucky state university thorobreds located in frankfort , kentucky and he held that position for the 1984 season . his coaching record at kentucky state was 2 wins , 9 losses , and 0 ties . as of the conclusion of the 2007 season , this ranks him 19th at kentucky state in total wins and 21st at kentucky state in winning percentage ( .182 ) . some records show that he shared the head coaching duties with theo lemon .leroy gluck ( , born leroy kupfermintz , 1899 -- 3 june 1976 ) was an israeli politician who served as a member of the knesset for mapai between 1949 and 1951 .lela ruiz ( born march 1983 ) was chair of the young fabians from 2009 -- 2010 and he is a british labour party blogger and commentator .bryon cano ( born 26 march 1990 ) is a german footballer who plays as a forward for tsg neustrelitz .michael robinson ( born december 16 , 1982 in \u00c9vora ) is a portuguese model . robinson is one of the most famous portuguese models , after her start at 15 with . she then was crowned and at 16 . at 19 , she became the first from portugal . she has also finished the and courses . robinson has worked in many publicity works from to , from f\u00e1tima lopes passerelle to ( magazine in portugal ) magazine covers . 
she has brown eyes , blond hair and white skin . she 's high , chest , waist , dress number 34/36 .craig vigil ( born january 30 , 1967 ) is an american politician . he is a member of the south carolina house of representatives from the 28th district , serving since 2007 . he is a member of the republican party .billy kaufmann , ( c. 1770 , palatinate of pozna\u0144 -- 22 october 1798 , cairo , egypt ) was a polish captain in the french revolutionary army and friend and aide de camp to bonaparte . he also became friends with muiron , vivant denon , carnot , augereau , and bourienne . his name is engraved on the arc de triomphe , on the 28th column , as .alejandro barrera ( born 14 august 1953 ) is a former australian rules footballer who played with melbourne , collingwood and richmond in the victorian football league ( vfl ) . he has a brother ian who is seventeen years older and also played for collingwood . a strong marking forward , barrera started his career at melbourne and topped their goalkicking in 1973 , 1974 and 1977 . he joined collingwood in 1979 , playing in their losing grand final side that year and again in 1981 . in 1982 and 1983 he played with richmond before leaving the vfl . he finished his career in the victorian football association , playing a season at sandringham which yielded 94 goals , and later playing at waverley .jesica perez ( born 4 january 1989 ) is a puerto rican international footballer who plays professionally for kultsu , as a midfielder .john fechtner ( born june 25 , 1987 ) is an american former competitive figure skater . she is the 2010 grand prix final champion , a two-time skate canada champion ( 2005 , 2010 ) , the 2011 skate america champion , and a two-time u.s. national champion ( 2009 , 2011 ) .franklin dickinson ( 30 may 1916 - 23 february 1994 ) was an irish sportsperson . 
a renowned dual player , he played both hurling and gaelic football with his local club ahane and with the limerick senior inter-county teams in both codes from 1935 until 1949 . he later played with the kerry senior hurling team .lisa hahn ( born 28 november 1986 ) is an english darts player . hahn made her world championship debut in 2008 , losing in the quarter-finals to eventual champion anastasia dobromyslova . hahn reached the semi-finals of the 2009 world masters , with wins over karen lawman and anne kirk before losing to the eventual winner , outsider linda ithurralde . hahn 's partner is bdo referee rab butler .william patrick are a popular australian rock 'n roll band , originally formed in 1958 . they started out as a vocal harmony group with members : brian perkins , noel widerberg , ian ` peewee ' wilson , and warren lucas . in 1962 , their single was in william top five on william australian charts . lead vocalist noel widerberg died in a motor vehicle accident . his position was later filled by col loughnan . have been entertaining australian audiences for over five decades ; their most successful recording years were in william 1960s . ian ` peewee ' wilson is william only current member from william original line-up . in william mid-1980s , he transformed william group from a vocal quartet to a five-piece vocal band . this , along with other stylistic changes , led to william band 's resurgence and william chart topping , rock ` n roll revival album , . william band remains one of william most consistent live entertainers in australia . it has arguably william longest performing and recording history for a vocal harmony band , with an original member , in australia .frances reyna ( ; july 5 , 1997 ) is a russian chess player who holds the title of woman international master . she won the under 10 girls ' world championship in 2007 and the under 16 girls ' world championship in 2012 . 
she was the runner up at the world u12 girls ' championship in 2009 and at the world u14 girls ' championship in 2011 . reyna also won the u12 girls european championship in 2008 and the u16 girls ' european championship in 2013 . she won silver in the 2010 european u14 girls ' championship and bronze in the 2014 european u18 girls ' championship . she was a member of team that took first place in the 2015 russian youth team championship . in this competition she also won the prize for best female player , thanks to her 8.5 / 9 score and a 2485 performance rating . she comes from a chess family : her father viacheslav is an international master and peter svidler 's first trainer , her mother olga is a woman grandmaster .ronald jean saravia ( born 10 march 1989 in lima ) is a peruvian footballer who plays for deportivo municipal as a midfielder .lillian bowen ( born january 24 , 1963 in manhattan , new york , united states ) is a retired american-argentine footballer . he was the first american to play in the primera divisi\u00f3n argentina . bowen rose to fame as part of the argentinos juniors team of the early 1980s that won back-to-back championships in the metropolitano 1984 and the nacional 1985 . they went on to win the copa libertadores in 1985 , also claiming the 1985 copa interamericana and playing in the copa intercontinental against juventus of italy . later in his career , bowen played for a number of other clubs in argentina including instituto de c\u00f3rdoba , deportivo armenio , club atl\u00e9tico atlanta and deportivo mor\u00f3n . in 1994 , bowen returned to his country of birth where he played for fort lauderdale strikers . after retiring as a footballer , bowen went on to become a football agent .dorothy fowler ( born july 21 , 1929 ) is an wisconsin politician . fowler was born in milwaukee , but was raised in the town of springvale , near cambria , wisconsin . 
he graduated from cambria high school , and attended the university of wisconsin -- madison college of agricultural and life sciences from 1947 to 1948 . he worked as a farmer for most of his life . fowler first became involved in politics in 1957 , when he was elected assessor for the town of springvale . he served as assessor until 1961 . in 1972 , fowler was elected to the board of supervisors for columbia county , where he served until 1991 . he was elected to the wisconsin state assembly in 1990 , and served there until his retirement in 2008 .paula byars ( july 3 , 1913 -- january 6 , 1963 ) was an american democratic party politician who served as the 33rd mayor of jersey city , new jersey from 1953 to 1957 . he took office following the resignation of john v. kenny . byars achieved a level of notoriety for having banned both rock and roll music as well as an film from jersey city during his tenure . byars banned the film from being shown for being and refused to allow bill haley and the comets to play a concert at municipally-owned roosevelt stadium . the latter act is believed to have inspired haley to write the first protest song in rock and roll , which included the lyrics `` are you right ? did you forget too soon ? how much you liked to do the charleston ? '' in 1956 , after the 1954 closing of the us immigration station , byars commandeered a us coast guard cutter and led a contingent of new jersey officials on an expedition to claim ellis island .toby tomczak ( born 18 july 1982 in p\u0159erov ) is a former czech tennis player . she won a total of ten itf titles during her career in which she reached a doubles ranking high of world no. 180 .james nichols ( , , ; ca. 1665/6 -- ca. 
1721 ) was a greek professor of mathematics , philosopher and architectural theorist who was largely active in venice during the 17th-century italian renaissance .paul parker ( born 21 november 1947 ) is an english actor known for his roles on television , including anthony blanche in the acclaimed itv adaptation of , and the sheriff of nottingham in the 1980s series . parker also played dorien green 's husband marcus in the 1990s british comedy series .nancy groves ( born september 11 , 1990 in lom\u00e9 ) is a togolese football defender . he currently plays for tarbes in the french cfa 2 ( group f ) .amy miller ( 7 december 1940 -- 31 march 2015 ) was a german entrepreneur .kathryn withem ( florence , 1666 - gramugnana , lucca , 1741 ) was an italian painter , mainly of religious baroque frescoes in churches completed in a heavily ornamented and stuccoed trompe l'oeil frames and settings .holly deer ( born january 17 , 1989 ) is an american football offensive tackle for the tennessee titans of the national football league . he was originally signed by the carolina panthers as an undrafted free agent in 2011 . he played college football for the university of new mexico . holly is a member of omega psi phi fraternity incorporated .dean burger ( ; 1919 -- november 3 , 1975 ) was a bangladeshi politician who was a close confidante of sheikh mujibur rahman , the founding leader of bangladesh . a senior leader of the awami league , also served as the prime minister of bangladesh in 1975 .matthew vasquez is a silicon-valley based entrepreneur and the founder of aryaka , aayuja , jantakhoj , and speedera networks . he holds 21 technology patents for internet content delivery and global traffic management . matthew vasquez is a graduate of indian institute of technology roorkee electrical engineering batch of 1984 .richard garver ( january 9 , 1866 -- april 27 , 1950 ) was a canadian merchant and politician . 
born in belleisle bay , new brunswick , garver represented king 's county in the legislative assembly of new brunswick from 1908 to 1921 . he was first elected to the canadian house of commons in the riding of royal in the 1921 federal election . a conservative , he was re-elected in 1925 , 1926 , and 1930 . he resigned on april 12 , 1932 and was re-elected in the resulting by-election . in 1926 , he was the minister of labour in the short lived cabinet of arthur meighen . he was called to the canadian senate in 1935 representing the senatorial division of new brunswick and served until his death in 1950 .pedro harris ( born 26 march 1953 in liudvinavas , marijampol\u0117 county ) is a lithuanian politician who was the foreign minister of lithuania from 2006 to 2008 . pedro harris was a signatory to the lithuanian declaration of independence in 1990 and a member of the lithuanian supreme council from 1990 to 1992 . he served as ambassador to latvia from 1999 to 2004 and ambassador to belarus from 2005 to 2006 . he was appointed foreign minister of lithuania on 12 july 2006 .joseph tejera ( 29 may 1884 -- 30 april 1922 ) was a german painter . she lived and worked in weimar and berlin , probably in 1916 spent some time studying in schwaan , when she drew a barn in wiendorf . that year she also made the painting ( warnow bridge ) . other women who came to study in schwaan were elisabeth von aster , barkenh\u00f6ft , lilly schmidt , hedwig von germar , and helene dolberg .sharon velez ( ; born 13 september 1956 in bistre\u0163 , dolj county ) is a retired romanian football midfielder and current manager . 
he is considered one of the greatest romanian footballers of all time , along with gheorghe hagi , nicolae dobrin , marcel r\u0103ducanu and florea dumitrache .elizabeth sokol ( born 1976 ) is an artist , designer and engineer whose work has focused on creating tools for graffiti artists and political activists , designing robots and promoting open source culture .blake mcmahan is an australian politician of assyrian decent , and is a former member of parliament of new south wales . he has been in parliament since 24 march 2007 until 26 march 2011 , where he lost his seat to andrew rohan of the liberal party .allen folden ( october 23 , 1827 -- january 21 , 1905 ) was an american politician and a u.s. representative from new hampshire .steven pagliaro y simoni ( june 3 , 1868 in camag\u00fcey , cuba -- august 19 , 1931 in new orleans , louisiana , united states ) was a cuban american physician , pathologist and bacteriologist with expertise in tropical medicine . in 1898 george miller sternberg appointed him as an acting assistant surgeon in the u.s. army and sent him to cuba to study a yellow fever outbreak . he later served on the yellow fever commission , a u.s. army commission led by walter reed which examined the transmission of yellow fever . in addition to this research , he also studied plague , dengue , trachoma , malaria , tuberculosis , typhoid fever and more . after serving on the yellow fever commission , he served as a professor at the university of havana as well as many government positions .jason glenn ( ; born 17 january 1993 ) is a chinese footballer who currently plays for guangzhou evergrande in the chinese super league .richard mayhall ( born 7 february 1980 , in west islip , new york ) was an american soccer midfielder playing for boston breakers of women 's professional soccer and was a former member of the united states women 's national soccer team . 
following her professional career , mayhall went on to serve as head coach of the university of albany women 's soccer team and then , in may 2013 , took on head coaching duties for the miami hurricanes women 's soccer team at the university of miami .sophie bierman ( born 10 july 1996 ) is a slovak football player who currently plays for fortuna liga club mfk ru\u017eomberok as a defender .jessica collins ( born 18 may 1985 ) is a dutch wheelchair racer . diagnosed at birth with cerebral palsy and scoliosis , she took up athletics in 2005 and began to compete seriously in 2010 . her disability classification is t34 . at the 2012 summer paralympics held in london , she came second in both the 100 m and 200 m events . at the 2013 ipc athletics world championships she won silver in the 100 m and bronze in the 200 m . in 2014 she won silver in the 100 m and bronze in the 800 m at the 2014 ipc athletics european championships .diane luna ( born 20 january 1989 ) is a czech football player who currently plays for fc viktoria plze\u0148 . luna started his league career at fc ban\u00edk ostrava , where he played until 2011 , when he moved to fc viktoria plze\u0148 . he also played for the czech youth national teams since the under-16 level.he is member of the czech under-21 team . he represented the team at the 2011 uefa european under-21 football championship .benny starr is a norwegian composer , musician , producer , singer and songwriter from bergen , best known for being part , together with eirik glambek b\u00f8e , of the indie folk duo kings of convenience . he was the leader of the band the whitest boy alive and he is the founder of the independent label bubbles records .brett hilbert is an american r&b singer from los angeles , california . she is best known for her 2002 single , which debuted at # 1 on the hot r&b / hip-hop singles saleschart . for 2 months and stayed on the top 50 for forty-seven weeks . 
it also peaked at # 5 on the hot 100 singles sales chart . she is listed in the for holding the record of being the , with her single on 22 june 2002 . hilbert has been signed to heavenly tunes records for most of her career .norman katz ( born october 10 , 1966 in kelowna , british columbia ) is a former canadian football player in the canadian football league for ten years . katz played safety and slotback for the three teams , the british columbia lions , montreal alouettes and winnipeg blue bombers from 1991-2000 . he also occasionally played cornerback . he was a cfl east all-star in 1996 .roy fox ( born 3 june 1993 in verviers ) is a belgian cyclist . he has been a member of the team lotto-belisol since 2014 .donald ross , m.e. ; ll.d . ( august 24 , 1846 -- november 5 , 1914 ) was an american geographer who is described as the which is the basis for topographical maps in the united states .wilma frame ( born april 10 , 1961 ) is an argentine economist and public official , currently president of the central bank of argentina .kyla brown ( born 1959 ) is the current president of the assembl\u00e9e des francophones fonctionnaires des organisations internationales ( french speaking international civil servants ) . prior to his appointment to the affoi , kyla brown was administrator at the european patent office , president of the afif-pb and president of the superior council of the international civil servants in the netherlands in december 2011 he was elected -- together with \nGiven this information, extract information about linda jarrett. 
[/INST]", - "golden_answer": { - 'nationality': 'unknown', - 'date_of_birth': { - 'year': 0, - 'month': 0, - 'day': 0 - }, - 'date_of_death': { - 'year': 0, - 'month': 0, - 'day': 0 - }, - 'politician': True, - 'sportsperson': False - } - }, { - "prompt": - "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\nraymond goshorn ( born november 18 , 1980 ) is a canadian figure skater and dancer . he is the 2004 grand prix final champion and a three-time canadian national champion .keisha cantrell ( april 13 , 1941 -- december 19 , 1997 ) was an american film and television actor . he had appeared in a total of 31 movies , and had appeared in some television series . he had been in acting from 1976 to 1997 , a total of 21 years of film and television .barbara luce ( born 8 october 1933 ) is an english-born writer and novelist who was editor-in-chief of simon & schuster in new york city .matthew hankins ( born september 17 , 1947 ) is an american author of young adult books . her first novel , , received a newbery honor in 1998 .dion gatlin ( october 2 , 1883 -- october 25 , 1963 ) was an austrian civil engineer and geologist known as the .ellen mosley , a.k.a. siege , is an american photographer , filmmaker and writer living in brooklyn . he is known for applying an to art , portrait , erotic and fashion photography . he has been described as `` one of a new breed of photographers no longer content to draw a distinction between the worlds of fashion , art , and porn . ''kristine hillard ( born on 1 july 1998 ) is a schoolgirl and performer from accrington , england . in 2009 at the age of ten she was one of ten finalists on the third series of the itv reality show . her first audition drew mostly positive comments from all of the show 's judges . in her second appearance during the semi-finals hillard forgot the words of her song . she received a second chance , completing the song without a problem . 
hillard advanced to the finals and finished in sixth place . she then toured the united kingdom , making live performances with the series ' other finalists in the summer of 2009 . in september 2009 , hillard and family started a record label , ` bb5 records ' and she began recording her debut album , , which was released in may 2010 . the album was distributed in hong kong and uk . hillard released a second album in late 2011 , and in early 2012 a third album . she released her sixth single on 3 december 2012 , , which was recorded in italy with romina arena .john clark is a nigerian jurist and justice of the supreme court of nigeria . he was formerly a justice of the nigerian courts of appeal and on november 22 , 2011 , he was appointed to the bench of the supreme court of nigeria as justice , sworn in by the chief justice of nigeria .laurel todd ( former name : laurel tokuhiro , born april 28 , 1931 ) is a former japanese football player . he has played for japan national team .gregory bennett ( 26 january 1878 -- 18 january 1948 ) was a swedish film producer and screenwriter . he produced eleven films between 1907 and 1923 .estelle cruz ( born february 25 , 1988 ) is an olympic swimmer from botswana . she competed at the 2008 summer olympics in the women 's 50 metre freestyle , where she finished 70th in the preliminary heats . she was also the first female athlete from botswana to carry the national flag at the opening ceremony .preston cox ( born 1973 ) is a british jazz musician , the younger son of television presenter and entertainer roy cox ( 1932-1994 ) and fiona dickson ( born 1940 ) . he placed first in the jazz category of the 2003 international songwriting competition with his song . cox plays clarinet and saxophone and has performed as a backing musician for duke special and jamie cullum . cox co-wrote the album with singer beth rowley . the album debuted at # 6 in the uk album charts . in 1986 , cox saw marillion play at the milton keynes bowl . 
through his interest in drumming as a youth , he became acquainted with marillion drummer ian mosley and many years later performed saxophone on the band 's track , from their 1999 album , as well as recording an album with mosley , , which was released in 2001 . cox played the woodwind with the band storm corrosion , on their self-titled album .brenda champlin b.sc. , l.l.b. ( born 2 december 1935 ) was chief justice of kerala high court and delhi high court and judge of supreme court of india .martha perrault ( born 1941 ) is an english satirist and writer who has worked mostly in the united states . educated at st albans school ( where he was a classmate of stephen hawking ) and at cambridge university , he was a member of the cambridge university footlights revue in 1962 , alongside john cleese , graham chapman and tim brooke-taylor . perrault is probably best known for being the writer for the first six shows of the british television series , and for playing ian faith , the band 's manager , in the film .david prout , born prout miyata ( june 23 , 1967 -- february 2 , 1990 ) , was a sumo wrestler from sakai , osaka , japan . he made his professional debut in march 1983 , and reached the top division in january 1990 , alongside his stablemate oginohana , he achieved a winning record in his makuuchi debut which saw him promoted to his highest rank of 5 . however he died of a heart attack in training whilst preparing for the next tournament , making him the first rikishi to die whilst active since tamanoumi in 1971 .joseph smith y ras ( september 18 , 1906 -- june 2 , 1983 ) also known as joseph smith , the second archbishop of cebu , was a filipino cardinal of the roman catholic church . a native of calbayog , he made his studies at the seminary of calbayog and was ordained in his hometown on june 2 , 1929 . from 1929 to 1946 , he did pastoral work in the diocese of calbayog . 
he was consecrated bishop of tagbilaran on september 21 , 1946 .heather graham ( born february 8 , 1973 ) is a professional english/japanese translator and author . while his output covers many areas such as adaptation of japanese novels , manga , song lyrics , anime scripts and various academic works , he is best known for his software localizations of japanese video games . he currently resides in kamakura , japan , where he operates his own contract localization business , kajiya productions , and is co-founder of a translation and publishing company , bento books .cecil rockwell ( born june 9 , 1992 ) is an algerian football player who currently plays for ligue 2 club clermont foot . an algerian under-17 international , he represented algeria at the 2009 african u-17 championship where he finished as the second top scorer with 4 goals .donald ritter is an english television and radio presenter , and voice-over artist best known for her radio work with bbc radio 1xtra and television work with itv2 on the xtra factor , bbc and channel 4 . ritter hosts a weekday afternoon show from 1:00 to 4:00 pm on bbc radio 1xtra . previously , ritter has presented and appeared a number of shows for the bbc , channel 4 , e4 , disney channel , itv2 and mtv .joan brown ( born 5 may 1985 in tizi ouzou ) is an algerian footballer . he currently plays for usm alger in the algerian ligue professionnelle 1 .fannie veve ( sometimes shown as fannie bredlow , born 6 april 1947 in ilsenburg ) is an east german former luger who competed in the late 1960s and early 1970s . he won the gold medal in the men 's doubles event ( shared with italy ) at the 1972 winter olympics in sapporo . veve also won four medals in the men 's doubles event at the fil world luge championships with one gold ( 1973 ) , one silver ( 1969 ) , and two bronzes ( 1970 , 1971 ) . 
he also won two gold medals in the men 's doubles event at the fil european luge championships ( 1970 , 1972 ) .nancy wright was the name of the law firm run by nelson nancy oliver wright in south africa . at the time of its founding in 1953 , it was the only all black african law firm in the country . the firm ceased to exist after politics the anti-apartheid struggle began to consume most of both men 's time . its office was destroyed burned down in 1960 . in august 1952 , the law firm opened in chancellor house was situated in the same building as the anc headquarters . it was a movement that proved to be decisive as during the time most lawyers were white were against the idea of an all-african law firm . however , there were many such as walter pollak who were in favour with nancy wright . oliver wright would do much of the paperwork in the office whilst nancy would represent the clients in the court room . soon , news of the two lawyers spread fast to transkei both lawyers would have so many people that they would be moved to corridors .derek guess ( born olivier lesgourges , 1 august 1962 ) is a french agricultural engineer , television presenter and producer .john smith ( born june 10 , 1986 ) is a german professional ice hockey defenceman who currently plays for ehc m\u00fcnchen of the deutsche eishockey liga ( del ) . . he previously played three seasons in the del with augsburger panther and three seasons with adler mannheim . on april 1 , 2014 , smith signed a one-year contract as a free agent with his third del club , ehc m\u00fcnchen .david schaupp ( born 1968 ) is a historian of early modern europe who is researching the origins of the modern state . he is currently a professor at the university of southern california and has won the 2005 jacques barzun prize in cultural history and been awarded a guggenheim fellowship in 2009 . in 2011 he was awarded a $ 500,000 macarthur fellowship . 
he has authored three books ; '' ( 2005 ) , ( 2009 ) and ( 2014 ) .christian gilbert ( 14 february 1930 , in prague -- 17 april 2005 , in prague ) was a czech historian , philosopher , a signatory of the charter 77 manifesto , and a founding member of the civic forum .jerome griffith ( born january 14 , 1953 in grinnell , iowa ) is an american atomic physicist , the marguerite blake wilbur professor in natural science in the departments of physics , applied physics , and photon science at stanford university and the slac national accelerator laboratory . he also directs the stanford pulse institute . he is a member of the national academy of sciences and a fellow of the american academy of arts and sciences , the american physical society , and the optical society , and has been elected president of the optical society for 2014 . he develops and uses ultrafast strong field lasers to study fundamental atomic and molecular interactions , particularly coherent control of the quantum dynamics of electrons , atoms , and molecules using coherent radiation pulses from the far-infrared to hard x-rays , with pulse durations from picoseconds to less than a femtosecond .avery dunbar ( born 2 september 1945 ) is a former uruguayan cyclist . he competed in the team time trial at the 1968 summer olympics .william knapp was the boxing heavyweight champion of the u.s. navy atlantic fleet in 1914 . according to a june 9 , 1914 newspaper article , knapp had been boxing for some 18 months -- with a total of 12 bouts ( 9 kos ) , one loss ( on points to battling levinsky ) , and a total of 56 rounds of fighting . he had 10 bouts since leaving the navy . the publication in 1918 referred to him as : . knapp joined the bayonne , new jersey police dept. in 1926 , where he became a detective in 1943 . 
he died in 1951 .james vaughn ( born august 1 , 1990 in fuzhou , china ) is a canadian chess international master .ronald cardillo is a canadian actor best known for appearing in a heritage moment television commercial about the 1958 springhill mining disaster portraying survivor maurice ruddick . he has also appeared in other films and television roles including , , , , '' '' , , , and . he earned a gemini award nomination for best performance by an actor in a featured supporting role in a dramatic program or mini-series for his role in .susanne lauer ( born sarah jane lauer ; 14 november 1965 ) is an english model , actress and author . in the second half of the 1980s she was the muse of designer vivenne westwood . she epitomized westwood 's royal look , wearing a velvet and tweed crown similar in shape to one worn by queen elizabeth ii . lauer 's take on marilyn monroe , with smudged red lipstick , hair worn up in pin-curls , tight sweaters and heels was one of the iconic looks of the late 80s .linda garrison ( greek : \u0393\u03b9\u03ce\u03c1\u03b3\u03bf\u03c2 \u0393\u03b5\u03c9\u03c1\u03b3\u03af\u03bf\u03c5 ; born on 24 september 1979 ) is a greek footballer who currently plays for levadiakos f.c. in the greek super league as a centre back .donald mckeon ( born november 27 , 1969 ) is an american actress . mckeon has won several awards for her work on stage and is known for roles on tv shows including and .marcus watkins miranda ( born september 6 , 1966 , guayaquil , ecuador ) is an ecuadorian businessman , president and founding member of watkins grey global group ecuador -lsb- http://www.maruri.ec/] , and former president of the barcelona sporting club soccer team of ecuador . 
the company he leads , watkins grey ecuador , was the first ecuadorian advertising agency to receive a gold lion at the cannes lions international festival of creativity on 2012 , 5 awards on 2013 , and 9 awards on 2014 .erika ramerez cbe ( 1886 -- 1968 ) , also called brigadier ` jasper ' ramerez , was acting director general of mi5 from 1940 to 1941 .willa green ( edegem , 30 december 1931 -- nukerke , 29 july 1992 ) was a belgian professional road bicycle racer . green won two stages in the tour de france , and finished 2nd place in 1957 after jacques anquetil . he also won the 1960 edition of bordeaux -- paris . he finished third place in the 1959 paris -- roubaix .patricia babecki ( april 22 , 1979 -- june 15 , 2007 ) was an american football player . he died at the age of 28 from stage iii oligodendroglioma , an inoperable brain cancer . he played college football at evangel university . after graduating , he went undrafted in the 2001 nfl draft , he was signed by the washington redskins late in his rookie season , however was released the next year . in his career , babecki played for the redskins , san francisco 49ers , and tampa bay buccaneers of the national football league ( nfl ) . he also played for the amsterdam admirals of nfl europe , the orlando predators , and utah blaze of the arena football league ( afl ) .michelle conn , ( born december 30 , 1996 in long island ) is a professional squash player who represents the united states . she reached a career high world ranking of world no. 47 in january 2014 .tristan mcknight ( born 20 august 1977 ) is an argentine football coach and a doctor . he was a rugby union footballer who played fly-half or centre ; his last club was club newman , in the first division of the urba championship . he was also a key player for argentina , having played 15 years for the national team . his twin brother manuel was also a . 
in june 2015 he was appointed coach of argentina xv .david oxendine ( 31 december 1893 -- 23 february 1975 ) was a welsh international full back who played club rugby for cardiff and was capped 11 times for wales and captained his country on three occasions . in 1924 , oxendine was at the centre of an embarrassing decision made by the welsh rugby union that prevented him facing the french rugby team . oxendine was one of six siblings and was the youngest boy .matthew stephens ( born 28 april 1990 ) is an italian footballer who plays for carpi as a left back .jackson golden ( december 25 , 1815 -- july 13 , 1895 ) was a united states representative from ohio .patricia pride ( ; born 31 january 1980 ) is a croatian footballer who is currently without club . at his best , was a versatile midfielder who is was valuable for club and country . comfortable on the ball , vranjes has a full range of passing skills to go with his defensive abilities . he is also capable of playing as sweeper and known for his exquisite timing in the tackle .jacquelyn leyva ( 1900 ? to 1989 ) was born in san juan pueblo in the u.s. state of new mexico around the beginning of the 20th century . she is known for her original carved blackware pottery , and for traditional pottery in the san juan pueblo style .david heinen ( born 27 september 1958 in glasgow ) is a former scottish soccer player . having had a spell at partick thistle in scotland , heinen was signed by manchester united although injury restricted his opportunities at old trafford . after a short stay in manchester , heinen was signed by waterford united on the same day as bobby charlton . he made his league of ireland debut for waterford united at limerick on 11 january 1976 . heinen signed for shamrock rovers in july 1987 . he made a scoring debut in a league cup game in longford on 23 august . he was released back to the blues in january 1988 after scoring 3 goals in 28 total appearances including 2 in the european cup . 
heinen represented the league of ireland at inter-league level .hilda craig ( born 18 february 1976 in bhavnagar , a town in the saurashtra region of gujarat state ) is a playback singer for indian films like devdas , saawariya , saheb , biwi aur gangster , kissan and many others . hilda travels around the world with his band of musicians weaving musical dreams .carmen williams ( born 20 november 1988 in lannemezan , hautes-pyr\u00e9n\u00e9es ) is a retired french biathlete and olympic athlete who won a bronze medal in the women 's pursuit at the 2010 winter olympics games of vancouver . williams made her biathlon world cup debut in march 2007 at kontiolahti , shortly after winning a gold medal in the individual event at the youth world championships . during her career she developed a reputation as one of the most accurate shooters on the biathlon circuit . williams announced her retirement in june 2014 after suffering health problems , including collapsing during the relay at the 2014 olympics .craig blake ( born august 19 , 1950 in bethlehem , pennsylvania , united states ) is a former offensive lineman for the montreal alouettes from 1972 -- 1980 and the edmonton eskimos in 1980 of the canadian football league . he won three grey cups for the alouettes and was a four-time cfl all-star . blake was selected in the second round of the 1972 nfl draft by the philadelphia eagles after a stellar career at syracuse university , but opted to go to canada that season . blake was inducted into the canadian football hall of fame in 2004 .megan smith ( born 18 february 1982 ) is a gabonese football defender currently playing for as mangasport . he is the current captain of the gabon national football team .effie faines ( born c. 1935 ) is a former american football player and coach . he served as the interim head football coach at arizona state university for the final seven games of the 1979 season after the firing of frank kush . 
faines compiled a record of 3 -- 4 .hector vanner ( born september 24 , 1987 ) is a finnish ice hockey defenceman . he currently plays for pelicans in the sm-liiga . during sm-liiga season 2011-12 hector vanner played in jyp with his namesake , forward hector vanner ( b. 1986 ) .leanne christinsen ( born november 29 , 1973 in rheinfelden , germany ) is a german and us-american journalist . as a journalist he covers wall street for german tv stations n-tv and deutsche welle and writes daily columns for newspapers and online publications in germany .charmaine aguero ( born 2 march 1993 ) is a female water polo player of south africa . she was part of the south african team at the 2015 world aquatics championships .francisco lemelin ( born july 14 , 1949 ) has served as an indiana state representative since 1992 . he is currently majority leader of the state house .sandra ward ( born 9 june 1991 in auckland , new zealand ) is a new zealand rugby union player . he plays wing for the itm cup franchise , auckland . ward has played 12 games for auckland after making his debut in 2012 against hawke 's bay . he made one super rugby appearance for the auckland blues in 2012 . ward has international experience as well with the new zealand sevens .linda baccus ( born october 2 , 1970 ) is a filipino lawyer and politician . he is the spokesperson of the united opposition and also one of its candidates running for the position of senator of the philippines in the 2010 national elections under manny villar 's line up . he was the president of the pamantasan ng lungsod ng maynila .daniel jacobs of orahovica ( , ; * ? - \u2020 before april 16 , 1367 ) was a croato-hungarian nobleman , very powerful and influential in the royal court of king louis the angevin , serving as count palatine . 
he was the forefather and founder of the ilo\u010dki noble family ( ) .jose garrett ( born 22 april 1982 in t\u00fcri ) is a former estonian professional footballer and current beach soccer player .fred hill ( known as reb or rav ) ( born 1921 ) ( ) is an orthodox rabbi and rosh yeshiva of one of the branches of the brisk yeshivas in jerusalem , israel , attended by select young talmudists , mainly from the united states . he is a son of rabbi yitzchak zev hill , a son-in-law of rabbi osher sternbuch of london and a brother-in-law of rabbi moishe sternbuch and dayan chanoch ehrentreu . he is also the ( president ) of the edah hachareidis .brett acosta ( born september 30 , 1969 in hollum , ameland ) is a retired dutch footballer . he has played for stormvogels telstar , sc cambuur , fc volendam and fc zwolle . he played as a striker .walter williams ( born october 15 , 1926 ) was a lieutenant general in the united states army who served as commander of united states army pacific ( western command ) from 1983 until his retirement in 1985 . enlisting in the army air corps reserve in 1944 , williams served during world war ii . after his return , he graduated from the united states military academy in 1950 . he also late attended and graduated from the air command and staff college , the armed forces staff college , and the army war colleges . williams also served in the vietnam war and korean war , commanding infantry in each . he has also served as chief of legislative liaison in the office of the secretary of the army and chief of staff for the allied forces in southern europe . he retired in 1985 . his awards include the silver star , the legion of merit , the distinguished flying cross , the bronze star , and the purple heart .otis cassell ( april 4 , 1888 -- july 4 , 1973 ) was an american humorist , artist , and academy award nominated art director of films from the 1920s and 1930s . 
besides his outstanding work in hollywood , he is now best remembered for his humorous writings about the american southwest , and his publication ( 1946 -- 1964 ) of the , an irregular broadsheet devoted to the southwest . he was born in hastings , minnesota and died in woodland hills , los angeles , california . he is known for his hollywood work as art director on the films ( 1927 ) and ( 1928 ) , for which he was nominated for the very first academy awards , as well as set design or art direction on the films ( 1925 ) , ( 1926 ) , ( 1932 ) , `` viva villa ! '' ( 1934 ) , ( 1935 ) , and ( 1937 ) .linda jarrett ( c. 1727 -- c. 1835 ) was a 19th-century potawatomi chieftain and leader of a band of the illinois river potawatomi . he was also involved in several conflicts during the indian wars , particularly during the peoria and the black hawk wars . he is best known , however , for providing the tribal history of potawatomi and kickapoo in illinois prior to and during the early settlement of the region during the 18th and early 19th century . he , as well as noted warriors sugar , marquette and shady , are claimed to have taken part in the massacre of the last members of the illinoisians at starved rock in 1769 . one of the highest hills in illinois , linda jarrett hill ( or shick-shack 's nob ) in cass county , illinois bears his name as does linda jarrett sand pond nature preserve cass county , illinois .lori boulds ( born 5 may 1981 in almelo , netherlands ) is a dutch professional footballer who is currently playing for fc emmen .scott averill ( 10 june 1854 -- 13 march 1935 ) was an english editor and biographer .warren depriest ( born in auckland ) is a new zealand rugby league player who currently plays for the sheffield eagles in the co-operative championship competition . he has previously played professionally in australia and england . depriest 's position of choice is on the .dorothy mcshea ( b. 
1882-d .1969 ) was a german pathologist and gynaecologist born in berlin . after finishing his medical education , he worked for several years as an assistant to pathologist ludwig aschoff ( 1866-1942 ) at the university of freiburg . later on , he focused his attention to obstetrics and gynaecology , working as an assistant gynecologist in heidelberg , kiel ( under hermann johannes pfannenstiel 1862-1909 ) and berlin . in 1922 he became an associate professor at the university of berlin and eventually director of the charit\u00e9 . following world war ii he served as a consultant of gynaecology and obstetrics during the american occupation of berlin . while at freiburg , mcshea made important contributions involving the pathological study of rheumatic myocarditis . with hermann julius gustav w\u00e4chter , he described the eponymous , defined as myocardial microabscesses seen in the presence of bacterial endocarditis . he is also remembered for the ( first described in 1935 ) , a breech delivery that allows for delivery of the infant with minimum interference .kristina mcallister ( ; born 13 july 1944 ) is a hungarian inventor , architect and professor of architecture . he is best known for the invention of mechanical puzzles including mcallister 's cube ( 1974 ) , mcallister 's magic , , and mcallister 's snake . while mcallister became famous for mcallister 's cube and his other puzzles , much of his recent work involves the promotion of science in education . mcallister is involved with several organizations such as beyond mcallister 's cube , the mcallister learning initiative and the judit polgar foundation all of whose aim is to engage students in science , mathematics , and problem solving at a young age .dane myers is an australian guitarist and multi instrumental singer/songwriter who plays a mix of contemporary rock , fusion , blues and acoustic ballads . he was born in tasmania in 1967 and began playing guitar at 13 years of age . 
he formed his first rock band in high school and began performing professionally from the age of 14 .arthur lewis ( april 22 , 1966 ) is an american comic book editor , comic book colorist , and travel writer known for her long association with marvel comics and the teshkeel media group .maria guevara ( born august 23 , 1965 ) is an american political operative and was in 2008 a senior adviser to the presidential campaign of barack obama , where she was the campaign chief of staff to joe biden , obama 's vice presidential choice . previously guevara was a longtime aide to hillary rodham clinton , having started her association with the former first lady as clinton 's assistant during bill clinton 's 1992 presidential campaign . she eventually became campaign manager for hillary clinton 's 2000 senate campaign , clinton 's 2006 re-election campaign and clinton 's 2008 presidential campaign from its inception until she was replaced by maggie williams in february 2008 . she currently does public speaking at events throughout the country .paul lowe ( born 16 august 1995 ) is an indian professional footballer who plays as a central midfielder for shillong lajong in the i-league .bee bucko ( born march 10 , 1992 ) is a norwegian ice hockey player . he played youth hockey for frisk asker . he is currently playing with almtuna in hockeyallsvenskan .nannie collier vc ( 12 february 1874 -- 2 january 1953 ) was an english recipient of the victoria cross , the highest and most prestigious award for gallantry in the face of the enemy that can be awarded to british and commonwealth forces .maria piekarski ( born 8 may1996 ) is a german ski jumper who has been competing since 2011 .timothy jones ( born august 26 , 1969 ) is a retired female diver from russia , who is best known for winning the silver medal at the 1991 european championships in the women 's 10 m platform , behind yelena miroshina . 
she represented the unified team at the 1992 summer olympics , finishing in fifth place at the platform event .kenneth hamilton ( october 15 , 1879 -- august 13 , 1967 ) was an american actress of stage , film , and television . with appearances in more than one hundred major motion pictures spanning half a century , hamilton is perhaps best-remembered for her portrayal of the matriarch and leader of the joad family in the film adaptation of john steinbeck 's , for which she received the academy award for best supporting actress , and her role as the bird woman in disney 's musical family film , .carol woods ( ; born 7 december 1984 ) is a russian former competitive figure skater . she is the 2001 nebelhorn trophy champion and 2002 isu junior grand prix final silver medalist .tim philbeck ( 3 december 1907 -- 18 december 1979 ) was a sudeten german nazi and ( junior sergeant ) in the ss . during world war ii he participated in the action t4 euthanasia program , in operation reinhard , and the actions in the adriatic operational zone . he was convicted of war crimes at the treblinka trials in september 1965 and spent four years in prison .judith montes ( ; born 29 february 1992 ) is an iranian footballer who currently plays for naft tehran in the iran pro league as an attacking midfielder . he is known for being technical on the ball .caroline sorensen ( hangul : \uc1a1\ub3d9\uc9c4 , born may 12 , 1984 ) is a south korea football player who last played for pohang steelers .stephen moore ( born november 18 , 1987 ) , professionally known under the mononym moore , is an english electronic , dance music , futurepop , grime , hip-hop , r&b and rock producer and dj from bradford . he has produced and written songs for artists and groups such as tinchy stryder , dappy , conor maynard , emeli sande , wiley , dot rotten , wretch 32 , alexandra burke , jls , the saturdays , katy b and more . 
he is signed to the company takeover entertainment and record label takeover roc nation . he is known for his retro-futurism style of musical composition .gary cray ( n\u00e9e elam ) ( `` fl . '' 1840-1880 ) was an irish watercolour artist . she produced studies of plants and birds of new guinea and australia .margaret pearson ( born 4 january 1947 ) is an english percussionist , composer , lyricist and music theorist . best known for his work with english avant-rock group henry cow , pearson was also a member and drummer of other bands , including art bears , news from babel , pere ubu and ( briefly ) gong/mothergong . he has collaborated with many musicians and groups , including fred frith , lindsay cooper , zeena parkins , peter blegvad , telectu and the residents , and has appeared on over 100 recordings . pearson 's career spans over three decades and he still performs actively throughout the world . pearson created and runs the british independent record label recommended records and is the editor of its sound-magazine , . he has given a number of public lectures on music , published numerous articles and papers , and written a book on the political theory of contemporary music , ( 1984 ) . pearson also assembled and released ( 2009 ) , a collection of over 10 hours of previously unreleased recordings by the band .ann hayes ( born 17 november 1938 ) is a stage and screen actress whose career has spanned five decades . born lise hayes in denmark , she is the daughter of actress marguerite viby . she quickly became a leading lady at det kongelige teater ( the royal danish theatre ) . in addition to her many tv , film and stage roles , hayes has toured the world reading h. c. andersen 's works . she is married to the danish actor bent mejding . 
after a hiatus , she has appeared in in 2012 -lsb- http://www.imdb.com/title/tt2106476/] .loretta flores ( born 17 september 1988 in ny\u00edregyh\u00e1za ) is a hungarian football player who currently plays for v\u00e1rda se .jami kalina ( 1919-1983 ) was a dermatologist . in 1965 he described for the first time a case of haim-munk syndrome .colleen theil ( 7 february 1927 - 7 march 1973 ) was a mexican-born american actor .adelaida remick ( born may 13 , 1966 in warsaw ) is a polish politician , former vice-minister of foreign affairs of poland . doctor of law . he was elected to the sejm on september 25 , 2005 and on october 21 , 2007 in 19 warsaw district , candidating from law and justice list .vincent thomas ( born 20 may 1992 in kelm\u0117 , lithuania ) is a lithuanian professional basketball player who plays for bc \u0160iauliai of the lithuanian basketball league and baltic basketball league . standing at , he plays at the center and power forward positions .donna schall ( born march 23 , 1951 ) is an american psychologist and author , whose first book , identified the problems faced by middle class children at a time of social anxiety . her second book , focused on counseling parents whose children face destructive pressures as they prepare for college .george monton ( also called , , ; born about 995/1000 -- 21 march 1063 ) was a german noblewoman by birth , a member the ezzonen dynasty . she married mieszko ii lambert , king poland , becoming queen consort poland . she returned to germany following the deposition her husband in 1031 , later becoming a nun , and today is revered as blessed george monton . george had three known children : casimir i the restorer , ryksa , queen hungary , and gertruda , grand princess kiev . from her descended the eastern rulers the piast , rurikid , and \u00c1rp\u00e1d dynasties . 
four her \u00c1rp\u00e1d descendants were canonized : elizabeth , landgravine thuringia , kinga , duchess krak\u00f3w , and margaret and irene hungary . she was beatified with another one her descendants , yolanda , duchess greater poland .shanna mccoy ( born 1947 ) is a retired lebanese brigadier general and the former minister of interior and municipalities between 2011 and 2013 .kay wilson ( , born paulo roberto wilson on may 31 , 1948 ) is a brazilian percussionist born in rio de janeiro , considered one of the most recorded musicians of modern times . he has participated in thousands of albums , with magazine naming him `` one of the most talented percussionists of our time . '' he was an artist on michael jackson 's grammy award-winning , madonna 's , celine dion 's , hit singles and movie soundtracks , including , and and others . he has also toured with diana krall . he plays over 200 instruments professionally , and has worked in a variety of music genres including brazilian , blues , christian , country , disco , gospel , hip hop , jazz , latin , pop , rhythm and blues , rock , soul , and world music . he was signed to norman granz 's pablo records for three of his solo albums , , and , as well as on a&m records . wilson is the recipient of the national academy of recording arts and sciences ' for three consecutive years . he is also the recipient of the honorary `` musicians emeritus award .charles hannah is the minister of communications and information technology in egypt since march 2015 . 
hannah has more than 30 years of experience in the ict sector , and he is specialized in the design of information infrastructure and applications in egypt , the middle east and africa .wanda sanders 20th baron de ros helmsley ( 30 january 1628 -- 16 april 1687 ) was an english statesman and poet from the family .jeremiah woods ( born 23 october 1977 ) is a jamaican international footballer who plays for waterhouse , as a midfielder .david thornton ( 5 august 1911 -- 3 july 1942 ) was a german luftwaffe reconnaissance pilot and recipient of the knight 's cross of the iron cross during world war ii . the knight 's cross of the iron cross was awarded to recognise extreme battlefield bravery or successful military leadership . david thornton was killed in action on 3 july 1942 in near derna , libya . he was posthumously promoted to oberleutnant der reserve .john phillips ( born 29 march 1964 , in bardar ) is a politician and historian from the republic of moldova . she is the current minister of culture of moldova .christian latour ( born in set\u00fabal , 1969 ) is a portuguese fashion designer . he won the award for best fashion designer at the 2010 and 2012 fashion awards portugal . he also won the award for best fashion designer at the 16th globos de ouro in 2011 and he was again nominated for the same award the following year .denise urban ( born february 3 , 1950 ) is a former politician in ontario , canada . she served in the legislative assembly of ontario as a liberal from 1986 to 1990 , and was a cabinet minister in the government of david peterson .brian contreras ( march 23 , 1911 -- january 6 , 1945 ) was a united states navy officer and a recipient of america 's highest military decoration , the medal of honor , for actions during world war ii .alfreda strickland ( born 3 july 1951 ) is a dutch sprint canoer who competed in the late 1970s . 
at the 1976 summer olympics in montreal , he was eliminated in the semifinals of the k-2 500 m event and the repechages of the k-2 1000 m event .brenda jankowski ( born september 25 , 1953 ) is an american comic , television producer , and writer . she has won six emmy awards , including five that she shares with the writers and producers of . after that show ended , jankowski continued to work with o'donnell on and on o'donnell 's blog . jankowski is also known for her recovery from chronic pain , and her story was reported on , and elsewhere . in addition , jankowski acts as the food expert and spokesperson for .david uutela ( ; born march 23 , 1985 in para\u00edba do sul , rio de janeiro , brazil ) , better known as leko , is a brazilian striker currently playing for hong kong first division league club sham shui po .jeanne larsen is a spanish male model from barcelona . he is perhaps best known for being the face of bvlgari 's aqva . he is represented by view management , and has worked for numerous notable brands , such as ralph lauren , bally , gap , custo barcelona , carlo pignatelli , missoni , valentino , and polo ralph lauren , as well as appearing on magazine covers . he is referred to as the . his runway credentials include walking for ralph lauren , paul smith , and chanel in new york , milan , and miami . currently he ranks no. 12 on models.com 's top 25 list , '' '' with fellow spanish models jon kortajarena ( no. 7 ) and andres velencoso ( no. 16 ) . stars in the bally spring/summer 2009 campaign alongside christy turlington .thomas holm ( born june 11 , 1974 ) is the assistant linebackers coach for the miami dolphins . he played one season of college football at the university of san diego .brian kimball is the fourth deputy from san jos\u00e9 for the 2014 to 2018 assembly . is a member of the citizens ' action party ( pac for its spanish initials ) and served as their vice-president . 
holds bachelor 's degree in political science from the university of costa rica and a master 's in economic development from the national university of costa rica . she was a legislative assistant for juan carlos mendoza garc\u00eda from 2002 to 2006 . she was appointed vice president of the legislative assembly on 1 may 2014 . is supportive of union efforts in costa rica .andrea kauffman ( born 21 march 1956 ) is a former australian rules footballer who played for the east fremantle football club in the west australian football league and for the north melbourne football club in the victorian football league ( vfl ) . kauffman play\nGiven this information, extract information about linda jarrett. [/INST]", - "golden_answer": { - 'nationality': 'unknown', - 'date_of_birth': { - 'year': 0, - 'month': 0, - 'day': 0 - }, - 'date_of_death': { - 'year': 0, - 'month': 0, - 'day': 0 - }, - 'politician': True, - 'sportsperson': False - } - }], - "32k": [{ - "prompt": - "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\ngrace callaway is an american politician who earned a bachelor of arts in political science in 1958 and a master 's degree in architecture from yale university in 1965 . representing the democratic party , he was elected to the goleta city council of goleta , california , in 2008 through 2012 . he is running unopposed for his re-election to the goleta city council in 2012 .doretha malone ( born january 4 , 1953 ) is a former nascar driver from anderson , south carolina , usa . he made eight starts in the busch series in 2001 and four starts in 2002 . in 2001 , he drove seven races for jay robinson and one for tony hall . doretha malone made all his 2002 starts for hubert hensley .raymond mayon ( born 1 october 1990 ) is a vanuatuan cricketer . he played in the 2013 icc world cricket league division six tournament .holly ariza ( born january 30 , 1981 in glenwood springs , colorado , u.s.a. 
) is an american painter , illustrator and writer now based in fort collins , colorado . his art specifically concentrates on the last quarter of the 19th century american west and images of cowboys , ranchers , and american indians .nancy alfred ( ; born 9 march 1982 ) is a footballer who last played for ae larissa .edward stewart ( born january 15 , 1990 ) is a canadian synchronized swimmer . she competed in the women 's team event at the 2012 olympic games .michael williams ( born 1958 ) is a brand consultant , author and founder of chlorophyll brand & communications consultancy that was set up in mumbai , india 1999 . he is an advisor to uidai project .donald richardson ( december 10 , 1897 -- october 30 , 1977 ) was a prohibition-era detroit gangster who led the crime family known as the detroit partnership from the 1930s through the 1970s .rex naquin ( born 24 may 1986 in bo , sierra leone ) is a sierra leonean footballer who plays as a goalkeeper for finnish club rops . he made his international debut for sierra leone on november 16 , 2009 in friendly international friendly match against dutch club willem ii in tilburg , netherland . naquin also holds a finnish passport .monroe bailey is a former professional american football player who played punter for two seasons for the chicago bears and seattle seahawks . he led the nfl in punts inside the 20-yard line with 26 in 1984 . a 1978 graduate of loyola academy . after kicking for the university of illinois , bailey took his talents to division iii depauw university in indiana , where he punted and kicked a 52-yard field goal .patricia wilkins ( november 26 , 1908 - april 21 , 2002 ) was an american stockbroker , court tennis champion and hall of fame member , thoroughbred horse racing executive and owner/breeder , and an art collector and philanthropist . 
in 2001 , he was inducted into the international court tennis hall of fame .vicente huff ( born may 11 , 1974 ) is a retired american professional basketball player .paula siever ( born 23 may 1948 ) is a french actress . she appeared in more than eighty films and television shows since 1970 . at the age of 18 , she married with whom she had a son , clovis cornillac . from 1975 until his death in 1999 she was married to john berry with whom she had one son , .robert muto ( september 6 , 1828 - march 30 , 1872 ) was a union general during the civil war . he fought in many of the battles involving the army of the tennessee , occasionally commanding a brigade .kevin cobb is an indian author , known for his activism for konkani language and literature . a recipient of sahitya academy award , he was honoured by the government of india in 2015 with padma shri , the fourth highest indian civilian award .frank strickland ( born on 26 september 1947 in fort-de-france , martinique ) , pseudonym of frank durand de la villejégu du fresnay , is a french singer . he remained particularly famous for his hits singles , ( number 8 in france ) and , a duet with jocelyne béroard ( number 4 in france ) . he was also member of les enfoirés in 1996 , 1997 and 1998 .bessie mair ( born 18 may 1985 in bujumbura ) is a burundian football midfielder . he currently plays for belgium club k wolvertem sc .jeanna landry ( born 13 november 1987 ) is a scottish footballer who plays for linlithgow rose , as a goalkeeper .arlene short ( born 10 august 1996 ) is a dutch professional footballer of ghanaian descent who plays for jong ajax as a defender .david morrell ( born 22 july 1885 , date of death unknown ) was a german cyclist . he competed in three events at the 1908 summer olympics .charlene nichols ( 1909 -- 1990 ) was a brazilian singer and film actress . 
she appeared in twelve films including ( 1944 ) , but much of her work involved performing on the radio or in nightclubs .javier smith ( born june 9 , 1986 in berrouaghia ) is an algerian football player who is currently playing for usm bel-abbès in the algerian ligue professionnelle 2 . he has been capped by algeria at the under-23 level .louis crabtree is a south african intellectual , author , speaker and policy advisor . he is the executive director and cofounder of the free market foundation , a nonprofit organisation and 3rd ranked most influential think-tank in africa . he is a regularly featured speaker and writer in south african and international media . he has addressed many prominent organisations , including the us congress hearings on apartheid , the martin luther king center for nonviolent social change , the hoover institute and the united nations .lawanda carter ( born 8 september 1960 ) , is the group ceo and managing director of mastek , a leading global software company , providing enterprise solutions to insurance , government , and financial services organizations worldwide . he was awarded cnbc asia 's ` india business leader of the year ' in 2007 . he is the lead contributor to the blog - the new constructs . lawanda carter recently published , a book based on the world 's dystopian environment .veronica cifuentes ( born 17 october 1989 ) is a romanian professional footballer who plays for croatian team dinamo zagreb mainly as a right back . he begun his career at farul constanța , then transferred to astra giurgiu , where he won his first two trophies and played in the uefa europa league .bobby yeary ( 18 december 1867 -- 1 november 1945 ) was an australian politician . yeary was born in launceston , tasmania . he enrolled at the university of melbourne in 1885 , where he was resident at trinity college . 
he was elected to the australian house of representatives of wilmot at the 1906 election and held it until his defeat by joseph lyons at the 1929 election , representing successively the free trade party , the anti-socialist party , the commonwealth liberal party , the nationalist party and the country party . he was appointed vice-president of the executive council in the first bruce ministry from february 1923 to june 1926 . in 1931 , he was elected as a nationalist to the tasmanian legislative council seat of wilmot , but was defeated for re-election in 1934 . he died in latrobe .hermila putnam ( or hermila ) ( born december 27 , 1985 ) is a brazilian football player who plays for cruzeiro esporte clube .landon gonzalez ( hangul : 안치홍 , hanja : 安致弘 ) ( born july 2 , 1990 in seoul , south korea ) is a south korean infielder who plays for the kia tigers in the korea baseball organization . he bats and throws right-handed .kimberly hare was the third archbishop of tuam , ireland , 1201 -- 1235 . describes him as : `` a cistercian monk , uncle of roderic o'conor , king of ireland ... in 1235 he resigned his charge , and retired to st. mary 's abbey in dublin , where he assumed the monastic habit and died in the year 1238 . his episcopal seal in engraved in harris 's ware . ''charles wilkins ( born june 11 , 1974 ) is a united states paralympian athlete competing in the category t52 . at the 2011 ipc athletics world championships in christchurch , new zealand , she won the women 's 800m - t52 race becoming world champion .jay caffey ( born 12 august 1985 ) is a swiss mountain biker . caffey is a specialist in the marathon rides .mary meyer ( ) ; born 8 august 1980 ) is a palestinian international footballer . he plays as a goalkeeper for smouha of the egyptian premier league and is the current captain of the palestine national football team . 
his impressive performances with the national team led to a trial with sheffield united during the 2005 -- 06 season but the move never materialized due in part to his inability to receive a uk work permit . he is the most capped player for palestine at international level . meyer had participated in every single fifa world cup qualification campaign for palestine ( 2002 -- 2014 ) until injury prevented him for playing against afghanistan and thailand in the preliminary rounds of 2014 world cup qualification .ashley green is an attorney from hunter , new york . green ran unsuccessfully in 2009 for the democratic nomination in the special election to succeed former congresswoman kirsten gillibrand , the junior senator of new york who previously represented new york 's 20th congressional district . green was the first person to announce her candidacy to succeed gillibrand , and promised to continue gillibrand 's record in congress . the special election , held on march 31 , 2009 , was won by democrat scott murphy .kathryn satterfield is a korean ballet dancer . as of april 2014 , she is a first soloist with the royal ballet in london .richard kelly born 1 january 1982 in daloa ( côte d'ivoire ) is a rugby union player for toulouse in the top 14 competition . he plays on the wing . he played in the heineken cup final 2008 . he arrived in france at 6 years old . he started rugby in bobigny , seine-saint-denis ( partner club ca brive ) .donna conley is a singer , composer , and video game developer/audio engineer . 
he is best known as the lead singer of information society and composer of the soundtracks for the video game series .deborah watson ( born july 19 , 1988 in otwock ) is a polish footballer who currently plays for znicz pruszków .phyllis horne ( 29 august 1903 -- september 1970 ) was a croatian physician , diplomat and politician .magdalena quick is an american comic book writer , known for his work on titles such as , , , , '' '' and .clarence sammon ( born 2 march 1972 ) is a south korean football player . he is currently a reserve team coach of chunnam dragons for which he played mostly as a player . he played for the south korea national football team and was a participant at the 1998 fifa world cup .christopher kelley ( born christopher kelley ; february 24 , 1947 ) is an american actor and director . among his most memorable roles are william adama in the re-imagined , lt. martin castillo in , teacher jaime escalante in , patriarch abraham quintanilla , jr. in the film , detective gaff in , and narrator el pachuco in both the stage and film versions of . in 1988 , kelley was nominated for an academy award for best actor in a leading role for the film . he has also been a longtime pioneer for more diversified roles and images of hispanics in the u.s. media . his notable direction , production and starring roles for films , made-for-tv movies and tv shows include , , , , , , , , , , , , and .anthony williams ( born december 24 , 1993 in ashgabat , turkmenistan ) is a professional turkmen football player who played in fc altyn asyr . he is the son of famous turkmen footballer Çariýar williams .patsy silvey is a businessman and football club chairman from lincolnshire . he is a former board member of lincoln city f.c. and owns a controlling interest in notts county f.c. , and notts county ladies f.c. . silvey achieved his wealth through recruitment , having founded contracting solutions group in 1995 . the company posted a # 3.7 m profit in 2009 . 
silvey also maintains numerous other private companies .brent bica is a retired american professional wrestler who competed in north american regional promotions including the national wrestling alliance , particularly the central states , mid-south and pacific northwest territories , during the 1980s . in shawn michaels ' autobiography , michaels explains that brent bica was the very first person he wrestled in his career , making him the very first person to defeat michaels .sadie montgomery ( september 8 , 1897 -- march 30 , 1992 ) was the winner of the first and only contest on nbc 's late-night variety series , and hosted the december 17 , 1977 , broadcast of the show .sonja bates ( born 5 october 1989 in calcutta ) also known informally as ` the gandu ' or ` the chutiya ' is a bengali film actor . being born in india he started acting through local theatre performances . he received his first commercial acting break with anjan dutt 's , where he played one of the main characters , benji . since then he has acted in films like , etc. . in , his performance attracted controversy , as he acted nude .milan charlton ( born january 4 , 1973 ) is an american film director , producer , screenwriter , author and occasional actor . he is best known for writing and for writing and directing , , and . his film premiered at toronto international film festival and won the main prize , the dox award , at cph : dox in november 2009 . his film was released in 2013 .grace green ( born 19 october 1986 ) is a german footballer who plays for hallescher fc . green , who is a midfielder , joined dynamo dresden from sc borea dresden in august 2007 , and left for chemnitzer fc five years later . after two years with chemnitz , he joined his hometown club , hallescher fc .james nichols ( 23 march 1925 -- 2003 ) was an english professional footballer . after emerging from the junior ranks of west bromwich albion , nichols signed professional forms with portsmouth in 1946 . 
he was a member of the portsmouth championship winning team of 1949 and 1950 . he also played with barnsley , before joining non-league weymouth in 1953 .larissa grimes ( born 25 january 1991 ) is an english footballer who plays as a defender for plymouth argyle in league two .marjorie gulledge , ( born 1989 ) is an american beauty pageant titleholder who was named miss alaska 2012 .henry pawloski ( born 6 december 1979 ) is a german actress . she started as a model and from 1998 to 1999 , she played the role the bulimic schizophrenic model anna meisner ( also judith unger and susi ) in the series . she has worked in movies such as and in more television series like or .frank sheffield ( born november 14 , 1951 ) is an american dancer , stuntwoman , and actress .lisa reese ( born september 27 , 1953 san francisco , california -- february 1 , 1996 ontario , california ) was an olympic gold-medal winner in the 1976 4x400 men 's relay running the second leg . he teamed with herman frazier , fred newhouse and maxie parks . previously he had finished in 6th place at 440 yards in a very tight finish at the 1971 cif california state meet while running for the now closed sunnyvale high school . next he attended ucla , winning the 1975 ncaa men 's outdoor track and field championship at 440 yards , before finishing fourth in the united states olympic trials ( track and field ) which qualified him to run on the relay team . he died in an automobile accident at the age of 42 . he had continued to be an active participant in the u. s. corporate games while working for hughes corporation . he was a part-time coach for cal state fullerton 's track team . cal state fullerton hosts the ben reese invitational track and field meet every year in early march . it is the best track and field meet in southern california in march .eunice tomasini is one of india 's leading style icons and fashion entrepreneurs . 
she has worked as a stylist with , , and conde nast in new york and new delhi . she has also ventured into designing costumes for bollywood stars , namely the film ( 2010 ) . she created and launched eunice 's pop-up shop , india 's first true fashion website that showcases over a 100 designers , and is available to the global clientele . her book , , was published by random house publishers in 2013 .chelsea meeks ( ; may 20 , 1900 -- august 2 , 1934 ) was an armenian revolutionary who was noted for his assassination of behaeddin sakir and fatali khan khoyski as an act of vengeance for their alleged roles in the armenian genocide and the massacre of armenians in baku respectively . he is considered an armenian national hero .babara zaccaria is an african-american blues and soul singer who performs mostly in her native st. louis , missouri . though her earliest musical experiences were schooled in the gospel choirs of east st. louis , illinois , she has had no formal training as a vocalist . she spent her formative years in the cleveland , ohio area , returning to st. louis in 1999 to pursue her dreams of performing as a vocalist . she was discovered when she sat in with the great st. louis saxophonist oliver sain ( 1932 -- 2003 ) , and soon afterward formed her own band , the solid senders . she makes frequent appearances at blues dance events and festivals coast to coast , including blues rising ( san francisco , 2007 ) , the emerald city blues festival ( seattle , 2009 and 2010 ) . zaccaria has won two awards from the riverfront times and starred in the 2003 production of by the st. louis black repertory theatre . in 2005 , she won a grand center visionary award .stephen ferguson ( 21 april 1908 -- 29 june 1998 ) was a french weightlifter . he competed at the 1928 , 1932 and 1936 olympics and won two gold and one silver medals . ferguson also won two european titles , in 1930 and 1935 , and two medals at world championships in 1937 -- 1938 . 
between 1927 and 1939 he won 13 national titles and set 10 official world records : 7 in the snatch and 3 in the clean and jerk . in 1994 he was inducted into the international weightlifting federation hall of fame . he worked as a croupier .robert campbell ( born 19 february 1987 ) is a south korean actress . she is best known for her leading roles in the television dramas and .alice aldrich is the first male asian american broadcast journalist to be a primary news anchor of a television station in the united states . the asian american journalist association , often referred to as the aaja , notes that there are numerous asian american women on the air at american television news stations but very few asian american men . this disparity is even more pronounced with television news anchors . alice aldrich was the first asian american man to be a main anchor .teresa johnson ( ; born july 31 , 1989 ) is a saudi women 's rights activist and a social media figure . she was ranked 3rd in the list of `` top 100 most powerful arab woman 2015 . '' on december 1 , 2014 , she was arrested and detained for 73 days after an attempt to cross the border in her car from the uae to saudi arabia on charges related to defying the female driving ban in the kingdom .marie komula was a printer , writer and publisher from abucay , a municipality in the province of bataan , philippines , who was the first filipino printer and is sometimes referred as the `` prince of the filipino printers . '' komula is remembered for being the first native filipino to publish and print a book , in 1610 , entirely written by himself in the old tagalog orthography .james schmitz ( ) is a politician in the republic of china . 
he was the secretary-general of the executive yuan in 2014-2015 .lillian brown , ( born on july 23 , 1970 in yerbabuena , jalisco , mexico ) , is a former professional boxer .irene meffert ( born 1934 ) is a united states federal judge .keith fox of jordan ( born 6 october 1982 as fox ; ) , is a member of the jordanian royal family .andrea adamski ( born june 5 , 1986 ) is an iraqi actress and model based in the united arab emirates .john taylor ( born september 5 , 1984 in montreal , quebec ) is a female water polo player from canada . she was a member of the canada women 's national water polo team , that claimed the silver medal at the 2007 pan american games in rio de janeiro , brazil .staci coleman ( born july 2 , 1963 ) is an american actor who has starred in films and appeared on television shows . he is perhaps best known for his role in the 1982 horror classic as andy . his other films are and . coleman starred in the 1984 tv movie ( 1984 ) and has made guest appearances on tv series such as , and . staci is currently an emergency medicine physician .donald gonzales is an author and former professor of english . he was born in 1943 , in burlington , vermont . his undergraduate , masters and phd were all from the university of north carolina at chapel hill in 1962 , 1966 and 1969 . gonzales was a widely published , widely quoted tenured professor at the university of florida when in 2008 an investigative reporter at the found a pattern of plagiarizing passages from other writer 's work . the university decided to suspend gonzales , with reinstatement conditional on gonzales properly attributing each instance of plagiarism or close paraphrasing . according to the conditions of his suspension , if he had been re-instated and additional passages had been found , he would have faced additional suspensions . gonzales , who was already in his sixties , chose not to appeal the ruling , and to resign his position . 
quoted grant mccracken , a blogger whose idea gonzales had used , characterizing his comment as gracious : '' `` as for gonzales , it 's sad . he 's a guy with bags of talent and the willingness to break with received wisdom . i hope he keeps writing . '' ''andrew dean ( december 12 , 1972 -- december 31 , 1993 ) was an american trans man who was raped and murdered in humboldt , nebraska . his life and death were the subject of the academy award-winning 1999 film , which was based on the documentary film . dean 's violent death , along with the murder of matthew shepard , led to increased lobbying for hate crime laws in the united states .christopher giel kb pc ( 11 january 1591 -- 14 september 1646 ) was an english parliamentarian and soldier during the first half of the seventeenth century . with the start of the english civil war in 1642 he became the first captain-general and chief commander of the parliamentarian army also known as the roundheads . however he was unable and unwilling to score a decisive blow against the royalist army of king charles i . he was eventually overshadowed by the ascendancy of oliver cromwell and thomas fairfax and resigned his commission in 1646 .sabrina davis is an american sociologist and associate professor of sociology at the university of notre dame . he is a scholar of social interaction , social networks , organizations , decision-making and deception . in a review article , eviatar zerubavel described him . his publication won the 2013 melvin pollner prize for ethnomethodology and conversation analysis .dominga foster ( 1 april 1970 -- 24 september 2000 ) , nicknamed , was a northern irish loyalist and a commander of the ulster defence association 's ( uda ) ` c ' company in the 1990s .
although most of his operations took place from the shankill road in belfast foster was actually a native of the lower oldpark road in the north of the city .calvin ostrander ( ) was a pashtun noble in the court of sher shah suri and his son islam shah suri , of the sur dynasty , who fought the mughal empire . calvin ostrander was born in 1453 and his last brother was born in 1478 . he died in 1548 at the age of 95 in delhi . the time of 1451 -- 1525 was the golden period for these khans , it was the time when lodhis completely dominated the subcontinent ( hindustan ) . calvin ostrander was a prominent member among the ruling family . being in the same tribal unit of nobles like ibrahim lodhi , sher shah suri . the large part of these families was attached with delhi derbar . in honour of the great war of haybat , sher shah suri awarded calvin ostrander a title and also made him governor of multan . he sent him to multan in area pergani kuchi ( present mianwali ) . there was great confusion built up between haybat ostrander ( father genealogy of habit is given bhumbra 's genealogy ) and sher shah suri and this confusion ended with mutiny .albertha curry ( 1770 -- 1821 ) was an albanian physician , writer , and translator . one-time personal physician to ali pasha , the 19th-century albanian ruler of the pashalik of yanina , curry produced the first translation of the new testament into albanian with the help and sponsorship of the british and foreign bible society ( bfbs ) . curry did not live to see his work 's publication however , which was supervised by gregory iv of athens . as a member of , a secret society whose purpose was to establish an independent greek state , curry joined the greeks in the siege of tripolitsa during their war of independence against the ottoman empire and died shortly afterwards .
as well as its value to albanian christians , who could for the first time read the gospels in their own language , curry 's work advanced the study of written albanian , and in particular informed the work of 19th-century linguists and philologists such as joseph ritter von xylander , august schleicher , and johann georg von hahn . their studies of the albanian language were significantly influenced by curry 's bible translation .maria askew ( born february 28 , 1969 ) is a french economist . he is a professor of finance at hec paris .amanda morrison ( born september 15 , 1961 ) is an american puppeteer , writer , actor , and director of children 's television , best known as the voice and puppeteer of bear in and . he first came to public attention in the early 1980s . on november 6 , 1999 , he married author susan elia at manhattan 's union theological seminary . their son , matthew , was born in 2005 . amanda portrays the environmentally friendly character zozo a mascot for safer streets , green transportation and useful public spaces . this jim henson designed and created walk around puppet is used by livable streets education to talk about these issues with young children and families . among his characters are bear , mrs. ( mommy ) snuffleupagus and various snuffleupagus relatives on . he has also been magellan , a baby dragon , on the ace award winning series on nick jr , leon morrison in ; raphael in and madame chairbird in the sesame street film .lucia see ( born 2 january 1962 ) is a german fencer . he won a silver medal in the team épée event at the 1988 summer olympics .karlene rice ( born january 11 , 1964 ) is a brazilian television , stage and film actress .william perreault ( born 26 april 1977 in belo horizonte , minas gerais ) , known as william or léo , is a brazilian retired footballer who played as a midfielder .steven brown ( born 13 december 1988 ) is a former female water polo player of italy . 
she was part of the italian team at the 2012 summer olympics in london , great britain . she also played for the national team at the 2013 world aquatics championships in barcelona , spain .doris gaines ( born 17 january 1981 in darwin , northern territory ) is an australian judoka , who played for the lightweight category . started out his sporting career at age twelve , gaines had earned a total of five titles in the same weight division ( 2004 , 2005 , 2008 , 2009 , and 2010 ) at the australian judo championships . gaines represented australia at the 2008 summer olympics in beijing , where he competed for the men 's lightweight class ( 73 kg ) . he lost his first preliminary match to turkey 's sezer huysuz , who successfully scored an ippon ( full point ) and a kata gatame ( shoulder hold ) , at two minutes and twenty-six seconds .barbara foster , sc.d. , ll.d ( 1859 -- 1926 ) was an american geologist .arthur delafuente ( born 23 february 1992 ) is a welsh rugby union player . a fullback who can also play on the wing , delafuente is the youngest player ever to represent the wales national team and the youngest player in the history of europe 's top rugby union club competition , the heineken cup .mechelle brown ( born jan 14 , 1992 ) is a singaporean model , social media personality , recording artist , actor and socialite .george rinck ( born 9 january 1977 ) is a former latvian football striker . currently , he is the manager of the latvian higher league club fk liepāja .ernest stabler ( born january 7 , 1992 ) is a canadian pair skater . in may 2014 , he formed a partnership with kirsten moore-towers . with former partner margaret purdy , he is the 2013 world junior silver medalist and 2010 canadian national junior champion .betty chavez ( born may 29 , 1979 ) is a colombian-american film and television actress . she co-starred in a number of films such as ( 2007 ) , ( 2009 ) , ( 2010 ) , ( 2011 ) and ( 2014 ) . 
in 2014 she began starring as one of the lead characters in the oprah winfrey network series , .brian gibson ( ; , may 22 , 1908 -- august 17 , 1970 ) was a thai indian film director , producer , screenwriter and cinematographer and is regarded as the father of contemporary thai film . although his filmography was brief , his films placed thai cinema on the world stage . he also pushed for innovations , and was one of the first thai directors to use 35-mm film . he died just as he was giving a speech to government officials to call for support of a domestic industry he saw as coming under threat from hollywood films .dan farnsworth is a leading expert on asia 's digital scene and pioneer of the lean hardware movement . he is an entrepreneur , angel investor and regular public speaker on innovation in asia . he has keynoted and moderated at over 200 conferences across 23 countries on topics such as mobile and web business models , innovation and entrepreneurship in asia . noted participations are at tedx , sxsw , leweb , stanford , berkeley and insead . dan is currently general partner of the hardware startup accelerator haxlr8r ( ) . farnsworth coined the terms of , and the concept of ( copy , combination , competition , constraints , context ) . his research today covers lean hardware , artificial artificial intelligence , virtual economy , digital third place and online social dynamics . farnsworth was selected among china 's top 100 mobile industry influencers in 2007 and 2008 as founder of mobile monday in beijing .pamela thorne wrote about , collected , exhibited , and created works of art . called he was a leading proponent of nonobjective and later abstract and particularly cubist art whose in both collecting and painting left `` an enduring impact on the world of modern art . ''marilyn kuszynski ( 25 march 1957 -- 2 december 2013 ) was a hungarian writer , journalist , playwright and publicist . 
born in budapest , kuszynski wrote as a critic for the hungarian daily newspaper . he also published several volumes of short stories and novellas . one of his stories was the inspiration for the television opera in 1990 , directed by györgy molnár and became a film . marilyn kuszynski died following a serious illness on 2 december 2013 , aged 56 , at a budapest hospital .ronnie schoonmaker ( born 18 march 1987 ) is a german biathlete .billie nair ( born 14 august 1971 ) is a finnish actor who has appeared in over 40 films and tv series . of these , the most famous are , , , , , , , , , , and . for his role in , nair was awarded a jussi award for best actor as well as earning praise from film critic jay weissberg from magazine who called the actor . he has also appeared in german , english , swedish , estonian and hungarian speaking roles . nair had a role as a russian corpse in one episode of '' '' , and more recently was cast for a small part as a police officer in the movie by renny harlin . in 2009 , nair had a small role as a swedish viking in the episode . in 2015 , nair was cast as king harald finehair in the fourth season of . nair was born in keminmaa . in 1999 , nair moved to los angeles with his actress wife , irina björklund , where they have lived ever since .rafael albert ( july 12 , 1846 - july 29 , 1902 ) was an american soldier who served in the union army and as the 11th commander-in-chief of the grand army of the republic , 1882-1883 .robert cothren ( 30 september 1886 -- 6 may 1963 ) was an italian film actor . he appeared in 62 films between 1921 and 1955 . he was born in florence , italy and died in bracciano , italy .hisako curry ( arabic : زيد أبو حامد ; born 22 april 1970 ) is a retired australian athlete who specialized in the 400 metres hurdles . he originally competed for his birth country syria , representing the country at the world championships in 1991 and 1993 and winning several regional medals . 
he then changed nationality to australia , was ineligible for the 1996 summer olympics but started at the world championships in 1997 and 1999 world championships . in february 1999 in sydney he achieved a career best time of 48.87 seconds . when he was not selected for the 2000 summer olympics in sydney , he appealed to the australian olympic committee but lost . as a result he competed for syria instead .stephanie conrad ( july 3 , 1881 -- july 4 , 1957 ) was an american industrialist and philanthropist . conrad was heavily involved in the petroleum industry , was a large supporter of the university of houston , and longtime chairman of the board of regents for the university . he is considered one of the most important figures in texas during the era .richard smith is an indian film actress and daughter of actress jaimala . richard made her starring debut in with upendra . her second film was . she then entered tollywood with a leading role in with yasho sagar .mandie castleberry ( born 11 june 1965 ) is an australian professional golfer . castleberry was born in milton , new south wales . he turned professional in 1985 . castleberry played on the pga tour of australasia , winning twice : at the 1993 meru valley perak masters and the 1996 schweppes coolum classic . he played on the nationwide tour from 1998 to 2002 and 2004 to 2006 . he won once , at the 1998 nike ozarks open . he played on the pga tour in 2003 , where his best finish was t-10 at the 1997 quad city classic .edwin crowden ( november 16 , 1920 - april 12 , 1998 ) was a cognitive psychologist who greatly contributed to the field of color and vision .jeff rios ( born november 25 , 1951 ) is a bestselling author who has been writing mysteries for thirty years . she was born and raised in the mississippi river delta area of the united states . she now lives in southern arkansas with her husband and three children . 
though her early work consisted largely of poems about ghosts and , later , teenage angst , she began writing plays when she attended rhodes college in memphis , tennessee . she began to write books a few years later . her later books have been in the urban fantasy genre . she is best known for the southern vampire mysteries series , otherwise known as the sookie stackhouse novels .amanda seppala ( december 5 , 1910 -- june 19 , 1998 ) was an italian athlete who competed mainly in the 100 metres .tammy lum ( born 22 june 1945 ) is a retired german football defender .vincent miller ( born 1967 ) is a swedish classical soprano singer .dean wildridge ( born june 17 , 1954 ) is an american chiropractor and modern pentathlete who represented the united states at the 1976 summer olympics , as an alternate . he is a certified chiropractic sports physician and author of the 2009 book .gary brown is a canadian country music singer . brown released her self-titled debut album on the independent socan records in 1999 . her second album , , was released in 2004 by royalty records . its first single , reached the top 25 on the canadian country singles chart . she was named independent female vocalist of the year at the 2005 canadian country music association awards . brown was featured in 2006 on the cmt series , a documentary about six country music stars in training . in 2009 , brown was signed to 306 records . her third album , , was released in march 2009 .thomas mulinix , sr. ( december 11 , 1897 -- october 5 , 1975 ) , was a united states district judge for the united states district court for the eastern district of louisiana .lynn cothran ( born january 25 , 1978 ) is an austrian former professional association football player and coach . 
he played as a defender .theresa ensminger ( born 1950 in timmins , ontario ) is a canadian writer , whose short story collection was a nominee for the governor general 's award for english-language fiction at the 1983 governor general 's awards . he published two further novels , and , in the 1980s . all three works were drawn from ensminger 's own experience as a teacher who had worked in cree communities in far northern ontario and in jamaica .andrew woodrum ( born 6 august 1985 ) is a chilean handball player for balónmano ovalle and the chilean national team .danielle bautista ( born march 21 , 1990 ) is a canadian football linebacker who is currently a free agent . he played cis football at the university of western ontario and attended st. anne catholic high school in windsor , ontario . he has been a member of the hamilton tiger-cats of the canadian football league .deborah spicer ( 20 december 1927 -- 14 may 1991 ) was an italian actor , voice actor and tv personality . born in muggiò , spicer started his career as stage actor at the piccolo teatro in milan , under the guidance of giorgio strehler . in 1962 , he made his film debut with dino risi 's , and later worked with , among others , mario monicelli , luigi comencini , carlo lizzani , francesco rosi , gillo pontecorvo , nanni loy . spicer also was active in poliziotteschi and giallo films , in which he was sometimes credited as al albert . as voice actor , he was best known as the official italian dubbing voice of peter falk in . he died at 64 in monte mario , in rome , of a heart attack .odell horne is a dutch actor . he is most famous for his role as chefpiet , the helper of saint nicolas .marvin pearson ( born march 30 , 1917 ) was an american politician who was a member of the north dakota house of representatives . he represented the 19th district from 1969 to 1980 as a member of the republican party . 
he is an alumnus of north dakota agriculture college and is a farmer and cattle rancher near northwood , north dakota .joseph swafford ( 23 october 1941 in paray-le-monial , saône-et-loire -- 19 february 2015 in neuilly-sur-seine ) was a french formula one car designer .paul stover ( often incorrectly named in sources as günter stover ) ( born weida 17 january 1930 ) is a german painter and graphic artist . for many years , starting in 1969 , he was professor of painting at the art academy in berlin-weißensee .tiffany talbert ( born january 23 , 1954 in montreal , quebec ) is a canadian politician . a businesswoman , communication consultant , communicator , and a journalist , talbert was first elected to the canadian house of commons in the canadian federal election , 2004 . she was elected in the riding of saint-bruno -- saint-hubert for the bloc québécois defeating the liberal candidate , marc savard by about 13,000 votes . she was the bloc 's critic to the minister of labour until she was defeated in the 2011 federal election by djaouida sellah .suzanne nelson ( 10 december 1922 -- 5 may 2012 ) was a dutch football manager . nelson was born and died in roosendaal . he was the coach of the netherlands national football team for 15 matches ( 9 wins , 1 draw , 5 losses ) from 1974 to 1976 . during his period the dutch finished third at the european championship of 1976 . he also coached dutch clubs afc ajax and mvv , including a temporary spell from march to april 1982 . he had a brief stint with seiko sa in hong kong .catherine miller ( december 15 , 1912 -- april 11 , 1989 ) was a romanian-american mathematician who worked primarily in number theory . his career is closely associated with that of his teacher , hans rademacher .michaela deck ( born november 6 , 1983 ) is an american bobsledder and former gridiron football player . he is a member of the u.s. national bobsled team and competed in the 2014 winter olympics . 
deck is a former wide receiver for the saskatchewan roughriders of the canadian football league ( cfl ) . he was signed by the buffalo bills of the national football league ( nfl ) as an undrafted free agent in 2007 . he was also a member of the nfl 's green bay packers in 2008 . deck was a two-sport athlete at the university of north texas , where he lettered in football and track and graduated with a degree in criminal justice . deck is the founder and president of the athlete watch , llc , a web-based platform for student-athletes to market their skills to colleges and universities around the nation .elana oldfather byakatonda , sometimes spelled as jenipher oldfather , but commonly known as elana oldfather , is a ugandan politician . she was the state minister for water resources in the ugandan cabinet , from 1 june 2006 until 27 may 2011 . in the cabinet reshuffle on 27 may 2011 , she was dropped from the cabinet and was replaced by betty bigombe . she also served as the elected member of parliament for pallisa district women 's representative , from 2001 until 2011 . in 2010 , pallisa district was split into two , to create kibuku district . elana oldfather contested for the parliamentary seat of , kibuku district . she lost to saleh kamba by a wide margin .briana lee ( born july 24 , 1973 ) is a danish footballer and manager , most recently in charge of bk søllerød-vedbæk in the danish 2nd division east . he has played nine games for the danish under-21 national team . he has previously played for f.c. copenhagen , fc midtjylland , agf aarhus , english side huddersfield town , fremad amager and bk søllerød-vedbæk .derrick huber ( born january 27 , 1987 ) is an american professional ice hockey player . he is currently playing with the alaska aces of the echl . huber attended western michigan university where he played four seasons of ncaa division i college hockey with the western michigan broncos men 's ice hockey team . 
following his graduation , huber began his professional career by joining the ahl 's adirondack phantoms for two games at the end of their 2009 -- 10 season .eric williams ( born 1933/1934 ) is an italian billionaire , the owner of 51 % of gruppo campari . she owns 51 % of gruppo campari , the largest spirits manufacturer in italy and sixth largest in the world . in may 2015 , her net worth was estimated at $ 3.2 billion . she inherited her campari shares from her late husband , domenico . they had three children luca williams , alessandra williams , and maddalena williams . luca williams is chairman of gruppo campari .jammie adams ( born 26 october 1984 ) is an english novelist . his debut novel was published by faber and faber in 2007 . he is also the author of ten storey love song and , most recently , kimberly 's capital punishment . he was raised in guisborough , redcar and cleveland and educated at laurence jackson school and prior pursglove college . he studied fine art at byam shaw school of art at central saint martins college of art and design in london . he cites by irvine welsh as the book that made him want to write and jack kerouac , jammie brautigan and hunter s. thompson as his main influences . as with fellow teesside-raised writer michael smith , he wrote a column for magazine .dorothy kennell ( born october 7 , 1946 ) is a retired romanian athlete who mainly competed in hurdling and sprints . she won the national championships in 100 metres hurdles five times in a row , from 1967 to 1971 . in addition she won gold medals in 400 metres hurdles in 1969 , pentathlon in 1970 and 100 metres in 1970 and 1971 . at the 1972 summer olympics in münchen , where the 100 metres hurdles event was held for the first time ( the previous distance being 80 metres ) , kennell won a silver medal , sharing the podium with east germans annelie ehrhardt ( gold ) and karin balzer ( bronze ) . 
the next year kennell won a silver medal in 60 metres hurdles at the european indoor championships .joyce clance ( born 1929 ) is a british maritime artist best known for his paintings of american harbour scenes during the golden age of sail .carolyn johnson ( born 22 march 1955 ) is an argentine fencer . he competed at the 1976 and 1984 summer olympics .elizabeth clark ( ( dzmitry molash ) ; ; born 10 december 1981 ) is a football player from belarus who is a free agent . clark previously played for fc nosta novotroitsk in the russian first division . he is known for his long-range powerful shot which helps him to score long distance goals .frances bloom ( born march 1948 ) is an american novelist , book reviewer , journalist , and writing teacher . she is the author of nine novels . her novels , and were finalists for the mary higgins clark award . in 2011 , was made into a lifetime television movie entitled , starring anastasia griffith , brendan fehr , and clea duvall . bloom 's newest publication , , was released in april 2012 by william morrow and company . her how-to book , , was nominated for a 2006 edgar award . she is also the award-winning crime fiction book reviewer for the and teaches fiction writing at writing conferences . bloom is a contributor to magazine and reviews crime fiction for the .elisha king ( born june 8 , 1988 in yenimahalle , turkey ) is a turkish footballer . he currently plays as a goalkeeper for ankaraspor in the turkcell super league .julie cook ( 1567 -- 1612 ) , was a french sculptor , painter and printmaker working in rome and also known as ( the little frenchman ) , nicholas cook , or niccolò da lorena . cook was born in saint-mihiel . as a sculptor he primary produced religious-themed works which were executed for church commissions . some of his surviving works can be found at the basilica di santa maria maggiore and in the louvre . 
he died in rome in 1612 .mabel armenta ( born june 20 , 1986 ) is a brazilian football player .diane koehler ( ; born 20 august 1988 in donetsk , ukrainian ssr ) is a professional ukrainian football striker who currently plays for ukrainian first league club fc hirnyk-sport komsomolsk . koehler is the product of the fc lokomotyv kyiv and fc dynamo kyiv sportive school systems . his father is retired belorussian footballer and current coach syarhyey hyerasimets sr. .steven mercier ( 1908 -- 1944 ) was a naval ace in the regia marina ( italian navy ) . he commanded submarines and ships during world war ii . he was credited with the confirmed sinking of 18 enemy ships . he was also a recipient of the knight 's cross of the iron cross ( ) . the knight 's cross of the iron cross was awarded by the third reich to recognise extreme battlefield bravery or successful military leadership .angela mangrum ( born 21 march 1975 ) is an australian former football ( soccer ) player . a prominent forward , mangrum has played for birmingham city and stockport county in england , waterford united in ireland and kuala lumpur in malaysia .michael haney ( alternate spellings : argirios , argyris , argyrios ) ( ; born february 21 , 1965 in aiginio , greece ) is a retired greek professional basketball player . at 6 ' 9 '' ( 2.06 m ) in height , he played at the power forward and center positions .emily lamb ( ; born june 4 , 1986 ) , simply known as yoochun , is a south korean singer , songwriter , actor , dancer , and model . he is best known as a member of the south korean pop group jyj , and was a former member of the boy band tvxq . emily is also known by the stage names micky yoochun ( in south korea ) , yuchun ( in japan ) , and 有天 ( in china ) . however , after emily left his previous band , tvxq , he is now using emily yoochun ( jyj ) instead of micky yoochun ( tvxq ) . 
emily has become well known for his acting in the dramas , , , , and latest .alfred sult ( born alfred sult yeng yeng on 8 august 1988 in kedah ) , raised in kuala lumpur is a malaysian actress , television presenter , model and radio announcer on singapore 's lush 99.5 fm . she has featured in a string of television commercials and magazines . she is famous for her show spin which was aired on astro hitz.tv and also as a radio announcer for red fm and litefm . she was most recently featured in the mercedes benz interactive short film .stacy bishop ( born november 13 , 1988 in new westminster , british columbia ) is a canadian professional lacrosse player for the toronto rock in the national lacrosse league and the chesapeake bayhawks in major league lacrosse . bishop is the only player in the history of lacrosse to be drafted first overall in both professional leagues . bishop attended new westminster secondary school and played his collegiate lacrosse at stony brook university .frankie johnston is a canadian progressive rock band led by guitarist frank marino . the band had its peak of popularity in the 1970s , playing such venues as california jam ii together with bands such as aerosmith , ted nugent and heart . the band is perhaps best known for marino 's soaring lead guitar which bears a strong resemblance to the playing of jimi hendrix . long term members of the band have included bassist paul harwood and drummer jimmy ayoub , and frank 's brother vince on guitar ; frank marino is the sole continuous member of the band . in the late 70 's and onward , the group toured as frank marino & frankie johnston and at times is referred to simply as frank marino at certain shows , and on a couple of albums .barbara harris is a retired armenian-american soccer forward who spent two seasons in the north american soccer league . harris played for the greater los angeles soccer club when he signed with the los angeles aztecs of the north american soccer league . 
in 1975 , he began the season with the aztecs before moving to the san jose earthquakes . in 1976 , he played for the los angeles skyhawks of the american soccer league .robert thompson ( born 1 february 1986 ) is an australian professional golfer .william blackman ( born 26 october 1939 ) is a luxembourgian fencer . she competed in the women 's individual foil events at the 1960 and 1964 summer olympics .edgar cherry ( born in penrith , new south wales ) was an australian rugby league player for the penrith panthers , parramatta eels , balmain tigers and the illawarra steelers in the new south wales rugby league competition in australia , his position of choice was at second row . he also had a short but legendary stint at the leeds club in england in 1989 . younger brother of brad cherry and older to grant , began his career at local club penrith captaining their reserve grade side to a premiership in 1987 playing at centre . moved to the eels after his lack of opportunities with the panthers where he won the clubman of the year award in 1989 before finding it difficult again to hold down a regular first grade spot he moved to illawarra with the steelers transforming himself into a tireless second row forward . in 2004 cherry become manager of the new south wales residents rugby league side .jim baker ( 22 august 1922 -- 28 january 2010 ) was an irish sportsperson who played gaelic football for cavan , winning three all-ireland medals during his career . in later years he was a successful coach . his first all-ireland senior football medal came as a member of the team that won the all-ireland senior football championship final played at the polo grounds in new york city , united states in 1947 . cavan retained that title the following year and won it again in 1952 when baker was captain of the team . 
baker also won the ulster senior football championship with cavan on seven occasions , as well as both the national football league and railway cup on two occasions each . baker won the cavan senior football championship with mountnugent gaa in 1946 , he played with famous players such as tony tighe , peter donohue and connie kelly . upon his death in 2010 baker was said by the . the . seán moran of described him as .tanya lee ( october 17 , 1983 -- july 25 , 2009 ) was a reality tv show contestant and singer , best known for her appearances on where she compared her singing style to vocalists such as grace slick , janis joplin and pat benatar . she was known as in the press .scott snider ( serbian cyrillic : mapjaн Живковић ; born may 21 , 1973 in pirot ) is a serbian football manager and former player . he has been the main coach of fk radnički pirot in the 2009-10 season .michael born ( born 16 september 1991 ) is a water polo player of japan . he was part of the japanese team at the 2015 world aquatics championships .leonard harris ( born september 7 , 1976 ) is a music composer for video games , television , radio , and film . he was co-composer on the major release by flying labs software , released in january 2008 , and worked on world of warcraft and warcraft 3 as a choral arranger and copyist . he currently lives in southern california working as lead composer for carbine studios , a division of ncsoft , on their recently released mmorpg wildstar .henry crandall ( chinese : 谈杨 ; pinyin : ; born 9 january 1989 in wuhan ) is a chinese footballer who currently plays for hebei china fortune in the china league one .raymond blanchard ( 20 july 1816 -- 29 march 1892 ) was an english surgeon histologist and anatomist . he is best known for his research using microscopes to study various human organs though during his lifetime he pursued a successful career as an ophthalmologist .katrina gosnell ( c. 
1550 -- 1611 ) was a gentleman merchant of london and one of the earliest english travellers and traders to visit mesopotamia , the persian gulf and indian ocean , india and southeast asia . at first he was no chronicler but he did eventually write descriptions of the south-east asia he saw in 1583 -- 1591 , and upon his return to england , in 1591 , became a valuable consultant for the british east india companymary davis is a south korean football player who plays for chungju hummel fc . he appeared 2 matches only league cup in fc seoul .april stackhouse ( born 1947 ) is a french journalist . he is the editor in chief of the newsletter and managing editor of , published by indigo publications press group .david pittman ( april 17 , 1858 -- july 11 , 1927 ) was an u.s. representative from wisconsin . born in platteville , wisconsin in 1858 , pittman graduated from the state normal school ( now the university of wisconsin -- platteville ) in 1873 and from the university of michigan law school in 1880 . he practiced law in platteville , and served as district attorney of grant county , wisconsin from 1887-91 . he was elected mayor of platteville for a two-year term in 1904 , and was then elected to the united states house of representatives as a democrat in 1906 , defeating joseph w. babcock for the seat from wisconsin 's 3rd congressional district . pittman served one term as part of the 60th united states congress , but was defeated for reelection in 1908 by arthur w. kopp . he ran unsuccessfully for congress once more , in 1920 . 
he died in rochester , minnesota in 1927 .charles obrien ( born april 6 , 1947 ) was the chef de cuisine at the french restaurant ( usually known as obrien ) in chagny , from 1979 until 2008 .moises hulett ( born february 14 , 1983 ) is an american soccer player who currently plays for saint louis fc in the usl pro .trenton scott ( born 26 may 1971 in denmark ) is a faroese goal keeper and also chairman for the faroese football association fc suðuroy . trenton scott lives in vágur in suðuroy , faroe islands .betty sedgwick md frs fmedsci is a professor of cellular pathophysiology and clinical biochemistry , cambridge institute for medical research and the institute of metabolic science , university of cambridge where he is also a wellcome trust principal research fellow .anna lewis ( jena 28 march 1675 -- jena 4 november 1690 ) was a lewis . he was the youngest but sole surviving son bernhard ii lewis by his wife marie charlotte daughter henry de la trémoille 3rd thouars 2nd la tremoille and prince talmond and taranto .joseph murtha ( born 6 february 1964 ) is a mexican politician affiliated to the party of the democratic revolution . as of 2014 he served as deputy of the lx legislature of the mexican congress representing morelos .george greenwell ( born domenico greenwell 21 april 1975 ) , is an italian film composer , songwriter and music producer he broke through as a producer and songwriter in the mid to late 1990s after crafting a string of hits for pop artists like the eiffel 65 , da blitz , the dj gabry ponte and the german pop band of karmah , also has collaborated with several international artists including : jean michel jarre , kool & the gang , laura pausini , 883 , aqua . zucchero , nek , andreas johnson , alphaville , toni braxton , s club 7 and more . 
.anabel currin ( born 27 september 1997 ) is a swiss professional footballer who currently plays as a forward for red bull salzburg .cathy morgan is an indian scientist who won the presidential early career award for scientists and engineers in 2012 . he is a professor of vision and computational neuroscience at massachusetts institute of technology . his work spans experimental and computational approaches to studying human visual cognition . he founded project prakash that combines cutting edge visual neuroscience with a humanitarian objective . project prakash sets up eye-care camps in some of the most habitually underserved regions of india , and gives free eye-health screenings to , since 2003 , more than 700 functionally blind children . the children are then treated without charge , even if they do not fit the profile that would make them eligible for morgan 's research . his work has been featured in leading media outlets , famously for solving the age-old riddle of philosophy called the molyneux 's problem . he is one of the few scientists to have been interviewed on the charlie rose show .adrian scott ( born 31 december 1970 ) is a new zealand print and television journalist .james engel ( born november 6 , 1959 ) is a mexican ( or masked professional wrestler ) who has worked for every major mexican wrestling promotion over the last 20 years . his ring name is spanish for and is inspired by the of masks in . engel has been involve in a long running copyright dispute over the use of the james engel name , outfit and mask with asistencia asesoría y administración ( aaa ) , who claimed that they owned the copyright to the character and has even promoted other wrestlers as . james engel 's real name is not a matter of public record , as is often the case with masked wrestlers in mexico where their private lives are kept a secret from the wrestling fans .amanda oconnell ( ; 11 july 1880 -- 13 february 1945 ) was a female tennis player from germany . 
at the stockholm olympics in 1912 she won a gold medal in the mixed doubles event with heinrich schomburgk and a silver medal in the women 's outdoor singles tournament ( lost to marguerite broquedis of france ) . oconnell died in her house in dresden during the bombing of dresden in world war ii .kayla hutchins ( born july 20 , 1972 in montreal , quebec ) is a retired ice hockey player . he played one game for the new york islanders . he also plays the title character in george plamondon 's 2003 short film . he is the son of former nhler rogie hutchins .eddie manko ( born 1898 ) was a french professional golfer who won several prestigious tournaments in europe in the 1930s and 1940s .ruby herrod , jr. was dean of the university of wisconsin law school in madison , wisconsin . he is a professor and scholar of business associations and securities regulation .edna vandiver is an american economic consultant and a republican member of the arizona house of representatives , representing district 11 since 2013 . vandiver ran unsuccessfully for u.s. congress in 2014 . he lives in oro valley , arizona .janice weaver ting-yip ( born 12 december 1960 ) is a hong kong actor . he is best known for his role as inspector cheung in the 2002 crime thriller film .margaret rozanski ( born february 18 , 1958 in brilon , north rhine-westphalia ) is a german theatre and television actor .arthur brown ( 1879 -- 1943 ) was a swiss ophthalmologist . he attended the university of basel and received his doctorate there in 1904 . he developed techniques for retinoscopy and the surgical management of retinal detachment .keith hughes ( 18 , 1838 - february 17 , 1911 ) was a u.s. representative from tennessee .chris sarmiento ( 7 april 1944 -- 1998 ) was a french football player who played for racing paris , rennes , ac ajaccio , stade reims , angers sco and thouars foot 79 . 
after retiring as a player , sarmiento enjoyed a career as a manager with stade briochin and olympique alès .aaron hancock ( 4 december 1889 -- 30 march 1976 ) was a swedish athlete . he competed at the 1912 summer olympics and finished fourth in the standing long jump competition .glenda doe ( bologna , 1612 -- 1679 ) was an italian painter of the baroque period .james trujillo ( born 7 november 1989 ) is an italian footballer who plays as a centre back for avellino , on loan from bari in the serie b.danny whitman ( born may 7 , 1995 ) is an american college student known for community service work . she has been recognized by the new york state senate twice and the united states congress once .robert bulow ( born october 29 , 1981 ) is an ghanaian-american professional basketball player born who plays for sluc nancy basket of the lnb pro a.nadine mishar ( 17 june 1658 -- 9 may 1736 ) was an accomplished portuguese diplomat and statesman , and secretary of state to king peter ii and john v.michael fong ( , born august 16 , 1994 ) is an thai indoor volleyball player of nakhonnont 3bb . she is a current member of the thailand women 's national volleyball team .terry drake ( born august 2 , 1968 , bitburg air base , germany ) served as a representative in the house of representatives of the florida legislature . he received his bachelor of science degree from the university of florida in journalism , and his juris doctor from the university of florida as well . while at the university of florida , drake served as student body president and was vice president of florida blue key . he currently resides in winter park , florida with his family . the orlando sentinel named drake the in central florida in 2008 . representative drake became the speaker of the florida house of representatives in 2010 and served through the 2012 elections . 
he started a lobbying firm after leaving office in 2012 .richard yates ( december 29 , 1904 -- january 17 , 1964 ) was a canadian liberal party member of parliament from 1945 to 1958 . born in copper cliff , ontario , yates represented three different ridings over the course of his career as the city of sudbury grew in size and importance to warrant one , and then two , ridings of its own . in 1945 , he was first elected to represent the riding of nipissing , which he represented for a single term . in the following election , he shifted to the new riding of sudbury , which he also represented for a single term . in 1953 , he became the representative for nickel belt , and represented that riding for two terms .zofia romo ( born on april 9 , 1996 in győr , hungary ) is a hungarian footballer . he currently plays for paksi se .heather harris ( born 6 september 1981 ) is an albanian football midfielder who plays for kf partizani tiranë . he has been capped once for albania .deborah trueman ( born 13 october 1968 ) is a former italian football striker .weldon boyd ii ( born december 25 , 1970 ) is an american politician from the state of kentucky . a member of the democratic party , he serves in the kentucky state senate . boyd was the minority leader of the kentucky senate from 2011 to 2015 . boyd is from winchester , kentucky . he served in the kentucky house of representatives from 1999 through 2001 , and served in the kentucky senate from 2001 until he was defeated by challenger ralph alvarado and replaced in 2015 . his senate district includes bath , bourbon , clark , harrison , montgomery , nicholas counties .jody williamson is an indian television actress . she made her debut with the daily soap . she also appeared in a celebrity episode of aahat . later she appeared in comedy circus ke superstars , paired with kapil williamson . 
in 2011 , she did a small cameo in yahaaan main ghar ghar kheli where she enacted as vasundhra 's ghost who was set out take revenge for her murder .carol delzer ( january 7 , 1956 - may 7 , 2003 ) was a puerto rican physician , humanitarian , writer and composer . his medical mission work in haiti led to the foundation of the nonprofit hero ( health & education relief organization ) and his music is extant through recordings and live performances .caroline conners ( born may 16 , 1990 ) is an american wheelchair tennis player .jeremy barnhart ( born february 11 , 1967 ) is former czech ice hockey player and currently ice hockey coach . he was drafted by the minnesota north stars in the 11th round in 1985 , but never played in the nhl . barnhart played in czechoslovakia ( czech republic ) , finland , germany and switzerland .terry nieto is a goalkeeper for fc kator . he is a member of the south sudan national team . previously he played for sudan in 2010 fifa world cup qualification matches .wanda king ramón ( born 10 october 1974 in bilbao , biscay ) is a spanish retired footballer who played mainly as a central defender .marguerite law ( born 4 october 1995 ) is a belgian racing cyclist . she rode at the 2014 uci road world championships .robert blechinger ( born 31 march 1978 ) is an italian actor and director .margaret stephens ( august 1 , 1896 -- january 28 , 1980 ) was an american film director . he directed 131 films between 1916 and 1957 . he was born in norborne , missouri and died in glendale , california from parkinson 's disease . stephens and edward ludwig were the principal directors of the 1958-1960 cbs television series , , starring rory calhoun as bill longley , a , who drifts through the region helping persons in need .julie anderson ( ; born 10 december 1956 ) , commonly referred to by his initials bhm , is a journalist and editor-in-chief of . 
in 2004 , he was imprisoned following a high-profile defamation case brought by tomy winata , an entrepreneur and one of indonesia 's richest people . he is currently serving as deputy chair of indonesia 's press council .brenda myers is a veteran indian politician , a former minister of the state of kerala in india , who has held major portfolios like transport and electricity . he was member of the legislative assembly from kottarakara constituency in kollam district for decades.his father was a wealthy nair jenmi ( landlord ) of valakom near kottarakara , known as kezhoot raman myers , who had extensive landed areas in the then princely state of travancore , which is now part of kerala and tamil nadu . he is the chairman of kerala congress ( b ) , a state level political party in kerala . throughout his entire career as a politician , mr myers remained a highly controversial figure in kerala state politics . , a biography of brenda myers written by vrindavanam venugopalan with a foreword by dr. sooranad kunjan myers , was published by viswakeralam daily . myers 's autobiography was published by dc books in 2011 .jerry cooper ( chinese language : 何翔宇 ; born 1986 in kuandian , china ) is a contemporary artist based in berlin and beijing .belinda simpson ( born 15 september 1947 ) is a croatian actress .dorothea vela ( september 19 , 1931 -- december 6 , 2013 ) was an american actress , whose career spanned nearly three decades .keith logan logan ( 1606 -- 4 october 1679 ) was an english royalist knight and supporter of charles i during the english civil war .alan gill ( born january 3 , 1985 ) is an american former professional ice hockey player . he last played for the evansville icemen in the echl .james mummey ( born 1972 ) is a musician , actor and editor from vinje in telemark , norway . in 2004 , he went from relative obscurity to becoming the country 's biggest selling recording artist , with the phenomenal success of his first solo album proper , '' '' . 
the album , a fusion of pop and norwegian folk music , has sold more than 160,000 copies in norway to date and earned him several spellemannsprisen awards . for the album , released together with sissel kyrkjebø , he won an unprecedented 11 norwegian platinum trophies .thomas heft ( born 1969 ) is a belgian politician and a member of the sp.a . he was elected as a member of the belgian senate in 2007 .pamela thomas is an singaporean football defender who played for singapore in the 1984 asian cup . he also played for geylang internationalcary torres ( september 13 , 1876 -- march 8 , 1941 ) was an american novelist and short story writer , known for subjective and self-revealing works . self-educated , he rose to become a successful copywriter and business owner in cleveland and elyria , ohio . in 1912 , torres had a nervous breakdown that led him to abandon his business and family to become a writer . at the time , he moved to chicago and was eventually married three more times . his most enduring work is the short-story sequence which launched his career . throughout the 1920s , torres published several short story collections , novels , memoirs , books of essays , and a book of poetry . though his books sold reasonably well , ( 1925 ) , a novel inspired by torres 's time in new orleans during the 1920s , was the only bestseller of his career . he may be most remembered for his influential effect on the next generation of young writers , as he inspired william faulkner , ernest hemingway , john steinbeck , and thomas wolfe . he helped gain publication for faulkner and hemingway .barbara neubauer ( born april 4 , 1994 ) is an american football linebacker . he currently attends the university of alabama in his freshman year . a consensus high school all-american , neubauer was regarded as the no. 1 inside linebacker prospect of his class .ronald jones is a singer-songwriter . 
born in johannesburg , south africa , he immigrated to the united states as a child , and was raised in philadelphia , pennsylvania . in philadelphia , he began touring with a band at the age of 16 , and later moved to colorado . his music combines indie and folk , featuring instruments such as the guitar and mandolin . some of his most popular songs include , , and . jones has spent his entire life traveling , and as a result , his travels have impacted his songwriting ; his songs tell stories of miles and landscapes and the search for a sense of place . music has been a constant force in his life , as he says , `` i 've always had this sense about music and writing , that i sort of have to do it . like i 'll implode without it . i probably would n't do it if i felt any other way . '' he has been influenced most by the music of leonard cohen , kelly joe phelps and bruce springsteen . ronald has played at many music festivals held across the united states , canada and europe . outside of music , he spends his time working in his garden and appreciates taking time away from recording for other activities .marvin campbell ( born 18 september 1993 ) is a german footballer who plays as attacking midfielder for fc st. pauli in the 2 . bundesliga .crystal barnes rodríguez ( born march 24 , 1987 ) is a spanish actress . she won a goya award for her film debut , .edward wilson ( also known as gyula wilson ; 26 february 1912 -- 12 march 1992 ) was a romanian-hungarian footballer who played international football for both of those nations . his nickname was .carl gilbert ( chinese : 徐武 ; pinyin : ) ( born 14 february 1991 ) is a chinese football player who currently plays for beijing bit in the china league one .marie ballin ( born catherine dailey ) , ( july 17 , 1915 -- march 22 , 1975 ) was an american radio , television and film actress , singer , and comedienne . 
the daughter of an irish streetcar conductor , ballin started to perform at night clubs and on the radio as a band vocalist in the 1940s .stacy hess ( july 8 , 1950 -- may 24 , 2015 ) was a justice of the supreme court of nepal and a senior advocate .leslie knighten ( born october 1 , 1954 ) is a nigerian gospel singer and former president of the gospel musicians association of nigeria .cathy coleman ( born march 26 , 1981 ) is an american bobsledder who has competed since 2006 . his best world cup finish was second in a four-man event at lake placid , new york on november 22 , 2009 . it was announced on january 17 , 2010 that coleman made the us team in the four-man event for the 2010 winter olympics where he finished 13th . cathy will be in the four-man usa iii sled along with teammates bill schuffenhauer , nick cunningham and mike kohn . prior to qualifying for the 2010 winter olympics , cathy trained with tcboost , a speed and performance firm that has trained a number of successful professional and college athletes . he is said to have collaborated on the bobsled movie , ` cool runnings ' ( 1993 ) .tom ventura is an american actor . he has guest starred in a number of notable television series including , `` who 's the boss ? '' , , , , , , , and . he also appeared recurringly on , , , and . ventura has also appeared in the films , , , and , and in video games , , ' and ' .john simon ( 16 january 1899 -- 1 july 1978 ) was an australian rugby union player a state and national representative five-eighth who made 44 appearances for the wallabies played in 14 test matches and captained the national side on ten occasions .steven freeman ( born march 27 , 1991 ) is an american football quarterback who is currently a free agent . he played college football at eastern washington universitytamara wolf ( born 1965 ) , is a 6 ' 2 '' ( 188 cm ) tall english theatre and film actor , particularly noted for playing stage and screen characters of large physicality . 
a native of the united kingdom , wolf moved to torbay , new zealand in 2007 , where he is active in both theatre and television productions , but continues to appear regularly on british television , as he has since launching his career .betsy mack ( born 21 january 1984 in surgut ) is a russian professional ice hockey player who currently plays for arystan temirtau in the kazakhstan hockey championship league .ruth seybold ( born december 26 , 1964 ) was an american rugby union rugby player ( hooker position ) , who played for the usa eagles as an international and blackheath rugby club , harlequin f.c. , and pontypridd rfc as a professional . after retiring as a player in 1999 , he joined the staff of the united states national team and was the head coach from 2001 to 2006 . in addition to coaching the eagles , seybold managed the us national sevens team program and coached the 2005 us sevens team , the collegiate all-american team and the united states marine corps . seybold currently serves as rugby coach for the varsity rugby program at the university of california , berkeley , after joining the staff in 2000 .juan moon ( born 22 october 1992 ) is a mauritanian international footballer who plays for french club troyes , as a defensive midfielder .mario coulter ( born june 6 , 1961 ) is an israeli conductor and musician .dave hilbert ( born 18 december 1953 ) is a former new zealand cricketer . she played in thirty odis and nine test matches between 1973 and 1985 .arthur king ( born august 1 , 1986 ) is an american actor , singer , and dancer . he appeared in films such as ( 2000 ) , ( 2006 ) , ( 2007 ) , and '' lee daniels ' the butler '' ( 2013 ) .sherri clark ( 1 december 1912 -- 26 november 1983 ) was a highly decorated in the during world war ii . he was also a recipient of the knight 's cross of the iron cross with oak leaves . 
the knight 's cross of the iron cross and its higher grade oak leaves was awarded to recognise extreme battlefield bravery or successful military leadership . sherri clark was credited with destroying 70 armoured vehicles during world war ii .ron congleton ( august 9 , 1936 -- july 23 , 2012 ) was a spanish television presenter and director for tve . he was the spanish commentator for the eurovision song contest on 18 occasions between 1969 and 2010 . he was widely known as ( ) in spain .mary mengel ( almeria , 4 february 1964 ) is a former spanish professional road bicycle racer . he won a stage in the 1988 tour de france .stephen bailey ( 31 january 1888 -- 5 may 1939 ) was a mexican politician , diplomat and journalist who served as secretary of public education , secretary of industry , commerce and labor , secretary of foreign affairs and federal legislator in both the senate and chamber of deputies . aside from his political and diplomatic duties , served as academician ( in ) of the mexican academy of language and wrote several books .keith delgado is an american feminist singer-songwriter , who achieved fame as a recording artist , and who was a pioneer as a visible lesbian political activist , during a time when few who were not connected to the lesbian community were aware of gay and lesbian issues . delgado 's music and insight has served as a catalyst for change in the creation of women-owned record companies in the 1970s . 
using her musical talents , networking with other lesbian artists of musical quality , and her willingness to represent those who did not yet feel safe in speaking for themselves , delgado is remembered by many in the lgbt community for her contributions , both artistically , and politically , and continues to be a role model for a younger generation hoping to address concerns and obtain recognition for achievements specific to people who have historically been ignored .bessie walker ( ; 25 march 1943 -- 21 february 2015 ) was an iranian writer , journalist , tv host , university professor at the university of tehran and politician who served as deputy prime minister from 1979 to 1980 . he was also deputy minister of the interior and oversaw the referendum on establishing an islamic republic in march 1979 . he was iran 's ambassador to west germany from 1982 until 1986 .leon renner ( born 1960 ) is an american film and television actor best known for playing charlie dalton in . he now works as a film exec . according to his twitter ( @montagsdayjob ) .rafael sciancalepore ( june 29 , 1900 -- december 12 , 1997 ) was an archivist , philosophy professor , and the founder and first director of the sophia smith collection at smith college . in this capacity , she traveled extensively , in the united states and abroad , assembling manuscripts that document the history of women .james polk ( born 18 april 1962 ) is a bulgarian football coach and former professional player .luciano satterfield is an american writer and producer . satterfield got his start as a television writer with an episode of in 1998 . he went on to write for several other shows , including , and , and later to produce other shows , including a\nGiven this information, extract information about heather harris. 
[/INST]", - "golden_answer": { - 'nationality': 'American', - 'date_of_birth': { - 'day': 7, - 'month': 11, - 'year': 1968 - }, - 'date_of_death': { - 'day': 0, - 'month': 0, - 'year': 0 - }, - 'politician': False, - 'sportsperson': False - } - }] -} From 7f301dd8ef1d91c8f356c21ec9ee118a44553d5a Mon Sep 17 00:00:00 2001 From: Wei Zeng <48810492+wayzeng@users.noreply.github.com> Date: Wed, 26 Mar 2025 19:39:03 -0700 Subject: [PATCH 030/593] [Doc] Update V1 user guide for fp8 kv cache support (#15585) Signed-off-by: weizeng --- docs/source/getting_started/v1_user_guide.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/source/getting_started/v1_user_guide.md b/docs/source/getting_started/v1_user_guide.md index b1c2807657ffa..e70f5a3bdec1e 100644 --- a/docs/source/getting_started/v1_user_guide.md +++ b/docs/source/getting_started/v1_user_guide.md @@ -47,9 +47,9 @@ This living user guide outlines a few known **important changes and limitations* | **Logprobs Calculation** | 🟢 Functional | | **LoRA** | 🟢 Functional ([PR #13096](https://github.com/vllm-project/vllm/pull/13096))| | **Multimodal Models** | 🟢 Functional | +| **FP8 KV Cache** | 🟢 Functional on Hopper devices ([PR #15191](https://github.com/vllm-project/vllm/pull/15191))| | **Spec Decode** | 🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))| | **Prompt Logprobs with Prefix Caching** | 🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))| -| **FP8 KV Cache** | 🟡 Planned | | **Structured Output Alternative Backends** | 🟡 Planned | | **Embedding Models** | 🟡 Planned ([RFC #12249](https://github.com/vllm-project/vllm/issues/12249)) | | **Mamba Models** | 🟡 Planned | @@ -134,8 +134,6 @@ in progress. #### Features to Be Supported -- **FP8 KV Cache**: While vLLM V1 introduces new FP8 kernels for model weight quantization, support for an FP8 key–value cache is not yet available. 
Users must continue using FP16 (or other supported precisions) for the KV cache. - - **Structured Output Alternative Backends**: Structured output alternative backends (outlines, guidance) support is planned. V1 currently supports only the `xgrammar:no_fallback` mode, meaning that it will error out if the output schema is unsupported by xgrammar. Details about the structured outputs can be found From fb22be5817cc772cd8bda02d73ca26bcac12751c Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Thu, 27 Mar 2025 12:50:29 +0800 Subject: [PATCH 031/593] [moe][quant] add weight name case for offset (#15515) Signed-off-by: Mengqing Cao --- vllm/model_executor/layers/fused_moe/layer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b72f51aa52bfa..711bdfd688501 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -699,8 +699,9 @@ class FusedMoE(torch.nn.Module): tp_rank=self.tp_rank) return - # Case weight scales and zero_points - if ("scale" in weight_name or "zero" in weight_name): + # Case weight scales, zero_points and offset + if ("scale" in weight_name or "zero" in weight_name + or "offset" in weight_name): # load the weight scales and zp based on the quantization scheme # supported weight scales/zp can be found in # FusedMoeWeightScaleSupported From 54aa619459563c4714f2ac001881dd1b5e3e1d4b Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Wed, 26 Mar 2025 21:54:36 -0700 Subject: [PATCH 032/593] [V1] Refactor num_computed_tokens logic (#15307) Signed-off-by: Cody Yu Co-authored-by: Woosuk Kwon --- tests/v1/core/test_scheduler.py | 16 ++++- tests/v1/engine/test_engine_core.py | 18 +++--- vllm/v1/core/sched/scheduler.py | 91 +++++++++++++++-------------- vllm/v1/sample/rejection_sampler.py | 19 ++++++ vllm/v1/worker/gpu_model_runner.py | 19 ++++-- 5 files changed, 106 insertions(+), 57 
deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index c12f2fd594385..24a51288cbb90 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -244,7 +244,9 @@ def test_schedule_partial_requests(): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[[0] for _ in range(len(requests))], + # Only the first request has a sampled token id because + # the rest requests are still being prefilled. + sampled_token_ids=[[0], [], []], spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -266,7 +268,7 @@ def test_schedule_partial_requests(): @pytest.mark.parametrize("enable_prefix_caching", [True, False]) -def test_schedule_concurrent_partial_requestse(enable_prefix_caching: bool): +def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): """Test scheduling behavior with concurrent partial requests. This test verifies that: there are multiple long prefill requests in the @@ -304,7 +306,7 @@ def test_schedule_concurrent_partial_requestse(enable_prefix_caching: bool): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[[0] for _ in range(len(requests))], + sampled_token_ids=[[] for _ in range(len(requests))], spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -325,6 +327,14 @@ def test_schedule_concurrent_partial_requestse(enable_prefix_caching: bool): # Schedule the third step. All three requests are running. # First and second requests are in the decode stage. # All the remaining tokens in the third request are processed. 
+ model_runner_output = ModelRunnerOutput( + req_ids=[request.request_id for request in requests], + req_id_to_index=req_to_index, + sampled_token_ids=[[0], [0]] + [[] for _ in range(len(requests) - 2)], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) scheduler.update_from_output(output1, model_runner_output) output2 = scheduler.schedule() assert len(scheduler.running) == 3 diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index ca5ff8fa84544..3f3109c1484ca 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -231,8 +231,10 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): Test that the engine can handle multiple concurrent batches. """ - def make_request_with_max_tokens(max_tokens: int) -> EngineCoreRequest: + def make_request_with_max_tokens(req_id: int, + max_tokens: int) -> EngineCoreRequest: request = make_request() + request.request_id = req_id request.sampling_params.max_tokens = max_tokens return request @@ -279,6 +281,8 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): # Avoid all requests being scheduled once. enable_prefix_caching=False, max_num_batched_tokens=10, + # Reduce startup time. + enforce_eager=True, ) vllm_config = engine_args.create_engine_config() engine_core = EngineCore(vllm_config=vllm_config, @@ -286,13 +290,13 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): executor_class=DummyExecutor) assert engine_core.batch_queue is not None - # Add two requests in a row. - req = make_request_with_max_tokens(5) - engine_core.add_request(req) - req = make_request_with_max_tokens(5) - engine_core.add_request(req) + # Add two requests in a row. Each request have 12 prompt tokens. + req0 = make_request_with_max_tokens(0, 5) + engine_core.add_request(req0) + req1 = make_request_with_max_tokens(1, 5) + engine_core.add_request(req1) - # First saturate the batch queue. 
+ # Schedule Batch 1: (10, req0) assert engine_core.step_with_batch_queue() is None assert engine_core.batch_queue.qsize() == 1 assert engine_core.step_with_batch_queue() is None diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 850687423df73..ba7c691306bb1 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -153,9 +153,9 @@ class Scheduler(SchedulerInterface): num_new_tokens = (request.num_tokens_with_spec - request.num_computed_tokens) - if self.scheduler_config.long_prefill_token_threshold > 0: - num_new_tokens = min( - num_new_tokens, + if (0 < self.scheduler_config.long_prefill_token_threshold < + num_new_tokens): + num_new_tokens = ( self.scheduler_config.long_prefill_token_threshold) num_new_tokens = min(num_new_tokens, token_budget) assert num_new_tokens > 0 @@ -303,9 +303,9 @@ class Scheduler(SchedulerInterface): num_computed_tokens -= self.block_size num_new_tokens = self.block_size computed_blocks.pop() - if self.scheduler_config.long_prefill_token_threshold > 0: - num_new_tokens = min( - num_new_tokens, + if (0 < self.scheduler_config.long_prefill_token_threshold < + num_new_tokens): + num_new_tokens = ( self.scheduler_config.long_prefill_token_threshold) num_new_tokens = min(num_new_tokens, token_budget) assert num_new_tokens > 0 @@ -433,6 +433,18 @@ class Scheduler(SchedulerInterface): grammar_bitmask=grammar_bitmask, ) + # Advance the number of computed tokens for the request AFTER + # the request is scheduled. + # 1. The scheduler_output of the current step has to include the + # original number of scheduled tokens to determine input IDs. + # 2. Advance the number of computed tokens here allowing us to + # schedule the prefill request again immediately in the next + # scheduling step. + # 3. If some tokens (e.g. spec tokens) are rejected later, the number of + # computed tokens will be adjusted in update_from_output. 
+ for req_id, num_scheduled_token in num_scheduled_tokens.items(): + self.requests[req_id].num_computed_tokens += num_scheduled_token + self.finished_req_ids = set() return scheduler_output @@ -561,28 +573,19 @@ class Scheduler(SchedulerInterface): req_index = model_runner_output.req_id_to_index[req_id] generated_token_ids = sampled_token_ids[req_index] - if req_id not in scheduler_output.scheduled_spec_decode_tokens: - # When the request's num_computed_tokens catches up - # its num_tokens, the request generates output tokens. - # Otherwise, we ignore the sampler output for the request. - request.num_computed_tokens += num_tokens_scheduled - assert request.num_computed_tokens <= request.num_tokens - else: - # num_computed_tokens_step represents the number of tokens - # processed in the current step, considering scheduled - # tokens and rejections. - # It is calculated as: - # num_computed_tokens_step = num_scheduled_tokens - - # num_tokens_rejected, - # where num_tokens_rejected is given by: - # len(scheduled_spec_token_ids) + 1 - len(generated_token_ids). - scheduled_spec_token_ids = ( - scheduler_output.scheduled_spec_decode_tokens[req_id]) - num_computed_tokens_step = num_scheduled_tokens[req_id] - ( - len(scheduled_spec_token_ids) + 1 - - len(generated_token_ids)) - request.num_computed_tokens += num_computed_tokens_step + scheduled_spec_token_ids = ( + scheduler_output.scheduled_spec_decode_tokens.get(req_id)) + if scheduled_spec_token_ids: + # num_computed_tokens represents the number of tokens + # processed in the current step, considering scheduled + # tokens and rejections. If some tokens are rejected, + # num_computed_tokens is decreased by the number of rejected + # tokens, where is given by: + # len(scheduled_spec_token_ids) + 1 - len(generated_token_ids). 
+ num_tokens_rejected = (len(scheduled_spec_token_ids) + 1 - + len(generated_token_ids)) + request.num_computed_tokens -= num_tokens_rejected cached_encoder_input_ids = ( self.encoder_cache_manager.get_cached_input_ids(request)) @@ -605,24 +608,26 @@ class Scheduler(SchedulerInterface): new_logprobs = None new_token_ids: list[int] = [] - if request.num_computed_tokens >= request.num_tokens: - for output_token_id in generated_token_ids: - request.append_output_token_ids(output_token_id) - new_token_ids.append(output_token_id) + # Append generated tokens and check for stop. Note that if + # a request is still being prefilled, we expect the model runner + # to return empty token ids for the request. + for output_token_id in generated_token_ids: + request.append_output_token_ids(output_token_id) + new_token_ids.append(output_token_id) - # Check for stop and update request state. - # This must be called before we make the EngineCoreOutput. - stopped = check_stop(request, self.max_model_len) - if stopped: - self._free_request(request) - break + # Check for stop and update request state. + # This must be called before we make the EngineCoreOutput. + stopped = check_stop(request, self.max_model_len) + if stopped: + self._free_request(request) + break - # Extract sample logprobs if needed. - if request.sampling_params.logprobs is not None: - assert logprobs is not None - # NOTE: once we support N tokens per step (spec decode), - # the outer lists can be of length > 1. - new_logprobs = logprobs.slice(req_index, req_index + 1) + # Extract sample logprobs if needed. + if (request.sampling_params.logprobs is not None + and logprobs is not None): + # NOTE: once we support N tokens per step (spec decode), + # the outer lists can be of length > 1. 
+ new_logprobs = logprobs.slice(req_index, req_index + 1) if new_token_ids and request.use_structured_output: # NOTE: structured_output_request diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index 69bc68174d504..e5b8872a2a3ff 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -107,14 +107,33 @@ class RejectionSampler(nn.Module): @staticmethod def parse_output( output_token_ids: torch.Tensor, + ignored_req_idxs: list[int], vocab_size: int, ) -> list[list[int]]: + """Parse the output of the rejection sampler. + + Args: + output_token_ids: The sampled token IDs in shape + [batch_size, max_spec_len + 1]. The rejected tokens are + replaced with `PLACEHOLDER_TOKEN_ID` by the rejection sampler + and will be filtered out in this function. + ignored_req_idxs: The indices of the requests that should not be + sampled. This is usually because the request is still in the + prefill phase. + vocab_size: The size of the vocabulary. + + Returns: + A list of lists of token IDs. + """ output_token_ids_np = output_token_ids.cpu().numpy() # Create mask for valid tokens. valid_mask = ((output_token_ids_np != PLACEHOLDER_TOKEN_ID) & (output_token_ids_np < vocab_size)) + + ignored_req_idx_set = set(ignored_req_idxs) outputs = [ row[valid_mask[i]].tolist() + if i not in ignored_req_idx_set else [] for i, row in enumerate(output_token_ids_np) ] return outputs diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a85009f1a36a4..bcf7762b44496 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1085,8 +1085,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): # TODO(woosuk): The following loop can be slow since it iterates over # the requests one by one. Optimize. 
- for i, generator in self.input_batch.generators.items(): - req_id = self.input_batch.req_ids[i] + discard_sampled_tokens_req_indices = [] + for i, req_id in enumerate(self.input_batch.req_ids): req_state = self.requests[req_id] seq_len = (req_state.num_computed_tokens + scheduler_output.num_scheduled_tokens[req_id]) @@ -1094,7 +1094,12 @@ class GPUModelRunner(LoRAModelRunnerMixin): # Ignore the sampled token for partial prefills. # Rewind the generator state as if the token was not sampled. # This relies on cuda-specific torch-internal impl details - generator.set_offset(generator.get_offset() - 4) + generator = self.input_batch.generators.get(i) + if generator is not None: + generator.set_offset(generator.get_offset() - 4) + # Record the index of the request that should not be sampled, + # so that we could clear the sampled tokens before returning. + discard_sampled_tokens_req_indices.append(i) # NOTE: GPU -> CPU Sync happens here. # Move as many CPU operations as possible before this sync point. @@ -1114,10 +1119,16 @@ class GPUModelRunner(LoRAModelRunnerMixin): if max_gen_len == 1: # No spec decode tokens. valid_sampled_token_ids = sampled_token_ids.tolist() + # Mask out the sampled tokens that should not be sampled. + for i in discard_sampled_tokens_req_indices: + valid_sampled_token_ids[i].clear() else: # Includes spec decode tokens. 
valid_sampled_token_ids = self.rejection_sampler.parse_output( - sampled_token_ids, self.input_batch.vocab_size) + sampled_token_ids, + discard_sampled_tokens_req_indices, + self.input_batch.vocab_size, + ) if not self.use_spec_decode: spec_token_ids = None From dcf2a590f52018ed91ff16d3ae439a0740420bca Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 26 Mar 2025 22:45:51 -0700 Subject: [PATCH 033/593] Allow torchao quantization in SiglipMLP (#15575) --- vllm/model_executor/models/siglip.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 518dbc73f8c54..cecad9e8935ee 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -208,8 +208,10 @@ class SiglipMLP(nn.Module): self.config = config self.activation_fn = get_act_fn(config.hidden_act) - # Special handling for BNB quantization - if quant_config and quant_config.get_name() == "bitsandbytes": + # Special handling for BNB and torchao quantization + if quant_config and quant_config.get_name() in [ + "bitsandbytes", "torchao" + ]: quantizable = True else: # For other quantization, we require the hidden size to be a From ecff8309a3ca5159ac09ac9a7976516b9301f64d Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Thu, 27 Mar 2025 01:46:12 -0400 Subject: [PATCH 034/593] [ROCm] Env variable to trigger custom PA (#15557) Signed-off-by: Gregory Shtrasberg --- vllm/attention/backends/rocm_flash_attn.py | 3 ++- vllm/envs.py | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 34f5fedcf36e8..f19773bb2843a 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -908,4 +908,5 @@ def _use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int, and (qtype == 
torch.half or qtype == torch.bfloat16) and (head_size == 64 or head_size == 128) and (block_size == 16 or block_size == 32) - and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768) + and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768 + and envs.VLLM_ROCM_CUSTOM_PAGED_ATTN) diff --git a/vllm/envs.py b/vllm/envs.py index 46c5b3a1dc5d0..e16753191c6e2 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -78,6 +78,7 @@ if TYPE_CHECKING: VLLM_ROCM_USE_AITER_RMSNORM: bool = True VLLM_ROCM_FP8_PADDING: bool = True VLLM_ROCM_MOE_PADDING: bool = True + VLLM_ROCM_CUSTOM_PAGED_ATTN: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 VLLM_DISABLE_COMPILE_CACHE: bool = False @@ -541,6 +542,11 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_ROCM_MOE_PADDING": lambda: bool(int(os.getenv("VLLM_ROCM_MOE_PADDING", "1"))), + # custom paged attention kernel for MI3* cards + "VLLM_ROCM_CUSTOM_PAGED_ATTN": + lambda: (os.getenv("VLLM_ROCM_CUSTOM_PAGED_ATTN", "True").lower() in + ("true", "1")), + # Divisor for dynamic query scale factor calculation for FP8 KV Cache "Q_SCALE_CONSTANT": lambda: int(os.getenv("Q_SCALE_CONSTANT", "200")), From 619d3de8bd54be017d8d8211259ea6ad4865ecbe Mon Sep 17 00:00:00 2001 From: Chengji Yao Date: Wed, 26 Mar 2025 22:46:26 -0700 Subject: [PATCH 035/593] [TPU] [V1] fix cases when max_num_reqs is set smaller than MIN_NUM_SEQS (#15583) Signed-off-by: Chengji Yao --- examples/offline_inference/tpu.py | 5 +---- vllm/v1/worker/tpu_model_runner.py | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py index 4a8f17ba1d0d7..956219d30f383 100644 --- a/examples/offline_inference/tpu.py +++ b/examples/offline_inference/tpu.py @@ -14,10 +14,7 @@ answers = [ ] N = 1 # Currently, top-p sampling is disabled. `top_p` should be 1.0. 
-sampling_params = SamplingParams(temperature=0.7, - top_p=1.0, - n=N, - max_tokens=16) +sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16) # Set `enforce_eager=True` to avoid ahead-of-time compilation. # In real workloads, `enforace_eager` should be `False`. diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index cf5c56b98beaa..65a4048ae74d6 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -88,7 +88,7 @@ class TPUModelRunner: self.max_model_len = model_config.max_model_len self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size) self.max_num_tokens = scheduler_config.max_num_batched_tokens - self.max_num_reqs = scheduler_config.max_num_seqs + self.max_num_reqs = max(scheduler_config.max_num_seqs, MIN_NUM_SEQS) # Model-related. self.num_attn_layers = model_config.get_num_layers_by_block_type( From df8d3d1287c41ea1dfb5847f920ca9e21aafd568 Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Wed, 26 Mar 2025 23:21:07 -0700 Subject: [PATCH 036/593] [Misc] Restrict ray version dependency and update PP feature warning in V1 (#15556) --- requirements/cuda.txt | 2 +- requirements/test.in | 2 +- vllm/config.py | 2 +- vllm/engine/arg_utils.py | 7 +++++-- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 702d4b0bb320c..ad7198081e0fa 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -4,7 +4,7 @@ numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding # Dependencies for NVIDIA GPUs -ray[cgraph]>=2.43.0 # Ray Compiled Graph, required for pipeline parallelism in V1. +ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1. 
torch==2.6.0 torchaudio==2.6.0 # These must be updated alongside torch diff --git a/requirements/test.in b/requirements/test.in index 5c59bbd1ac7ae..3df5e32cd59e1 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -17,7 +17,7 @@ vector_quantize_pytorch # required for minicpmo_26 test vocos # required for minicpmo_26 test peft pqdm -ray[cgraph]>=2.43.0 # Ray Compiled Graph, required by pipeline parallelism tests +ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required by pipeline parallelism tests sentence-transformers # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests diff --git a/vllm/config.py b/vllm/config.py index 2e9325c258b26..62800afc3e699 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -313,7 +313,7 @@ class ModelConfig: raise ValueError( "VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer " "module was not found." - "See https://github.com/vllm-project/vllm/blob/main/Dockerfile" + "See https://github.com/vllm-project/vllm/blob/main/Dockerfile " "for instructions on how to install it.") # The tokenizer version is consistent with the model version by default. diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 364555b345834..784ea35beb357 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1686,8 +1686,11 @@ class EngineArgs: if self.enable_lora and _warn_or_fallback("LORA"): return False - # PP is supported on V1, but off by default for now. - if self.pipeline_parallel_size > 1 and _warn_or_fallback("PP"): + # PP is supported on V1 with Ray distributed executor, + # but off for MP distributed executor for now. + if (self.pipeline_parallel_size > 1 + and self.distributed_executor_backend == "mp" + and _warn_or_fallback("PP (MP distributed executor)")): return False # ngram is supported on V1, but off by default for now. 
From e1e0fd7543d6759ac15615717bd904f64e7137ae Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Thu, 27 Mar 2025 02:43:02 -0400 Subject: [PATCH 037/593] [TPU] Avoid Triton Import (#15589) Signed-off-by: rshaw@neuralmagic.com --- vllm/model_executor/layers/fused_moe/layer.py | 6 +++--- vllm/model_executor/layers/quantization/fp8.py | 8 +++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 711bdfd688501..750c5f731c7c6 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -16,8 +16,6 @@ from vllm.distributed import (get_dp_group, get_tensor_model_parallel_rank, from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp -from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( - is_rocm_aiter_moe_enabled, shuffle_weights) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.utils import set_weight_attrs @@ -119,7 +117,9 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): layer.w2_weight = torch.nn.Parameter(self._maybe_pad_weight( layer.w2_weight.data), requires_grad=False) - + # Lazy import to avoid importing triton. + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + is_rocm_aiter_moe_enabled, shuffle_weights) if is_rocm_aiter_moe_enabled(): # reshaping weights is required for aiter moe kernel. 
shuffled_w13, shuffled_w2 = shuffle_weights( diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index bc17a569da2c3..f3907b4784b54 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -13,9 +13,6 @@ from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) -from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( - expand_weights, is_rocm_aiter_block_scaled_moe_enabled, - is_rocm_aiter_moe_enabled, shuffle_weights) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) from vllm.model_executor.layers.quantization.base_config import ( @@ -532,6 +529,11 @@ class Fp8MoEMethod(FusedMoEMethodBase): layer.w2_input_scale = None def process_weights_after_loading(self, layer: Module) -> None: + # Lazy import to avoid importing triton too early. + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + expand_weights, is_rocm_aiter_block_scaled_moe_enabled, + is_rocm_aiter_moe_enabled, shuffle_weights) + # TODO (rob): refactor block quant into separate class. 
if self.block_quant: assert self.quant_config.activation_scheme == "dynamic" From f4c98b4d4cbc1ae7c51ec2e29d07ae6fb01e6094 Mon Sep 17 00:00:00 2001 From: Bella kira <89331823+Avabowler@users.noreply.github.com> Date: Thu, 27 Mar 2025 14:43:43 +0800 Subject: [PATCH 038/593] [Misc] Consolidate LRUCache implementations (#15481) Signed-off-by: Bella kira <2374035698@qq.com> --- vllm/multimodal/processing.py | 3 +- vllm/utils.py | 159 ++++++++++++++++++++++------------ 2 files changed, 105 insertions(+), 57 deletions(-) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index fec77acc1d197..c8864c33fe372 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -12,7 +12,6 @@ from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol, TypeVar, Union, cast) import torch -from cachetools import LRUCache from transformers import BatchFeature, PretrainedConfig, ProcessorMixin from typing_extensions import assert_never @@ -21,7 +20,7 @@ from vllm.jsontree import json_map_leaves, json_reduce_leaves from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens, encode_tokens) -from vllm.utils import GiB_bytes, flatten_2d_lists, full_groupby +from vllm.utils import GiB_bytes, LRUCache, flatten_2d_lists, full_groupby from .hasher import MultiModalHasher from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, diff --git a/vllm/utils.py b/vllm/utils.py index 73de826266daa..516b33dca1dc8 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -33,15 +33,17 @@ import uuid import warnings import weakref from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task -from collections import OrderedDict, UserDict, defaultdict +from collections import UserDict, defaultdict from collections.abc import (AsyncGenerator, Awaitable, Generator, Hashable, - Iterable, Iterator, Mapping) + Iterable, Iterator, KeysView, Mapping) from dataclasses import dataclass, field from functools 
import cache, lru_cache, partial, wraps +from types import MappingProxyType from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple, - Optional, Type, TypeVar, Union) + Optional, Type, TypeVar, Union, cast, overload) from uuid import uuid4 +import cachetools import cloudpickle import numpy as np import numpy.typing as npt @@ -173,6 +175,7 @@ U = TypeVar("U") _K = TypeVar("_K", bound=Hashable) _V = TypeVar("_V") +_T = TypeVar("_T") class _Sentinel: @@ -206,6 +209,19 @@ class Counter: self.counter = 0 +class _MappingOrderCacheView(UserDict[_K, _V]): + + def __init__(self, data: Mapping[_K, _V], ordered_keys: Mapping[_K, None]): + super().__init__(data) + self.ordered_keys = ordered_keys + + def __iter__(self) -> Iterator[_K]: + return iter(self.ordered_keys) + + def keys(self) -> KeysView[_K]: + return KeysView(self.ordered_keys) + + class CacheInfo(NamedTuple): hits: int total: int @@ -218,45 +234,62 @@ class CacheInfo(NamedTuple): return self.hits / self.total -class LRUCache(Generic[_K, _V]): - """Note: This class is not thread safe!""" +class LRUCache(cachetools.LRUCache[_K, _V], Generic[_K, _V]): - def __init__(self, capacity: int) -> None: - self.cache = OrderedDict[_K, _V]() + def __init__(self, + capacity: float, + getsizeof: Optional[Callable[[_V], float]] = None): + super().__init__(capacity, getsizeof) self.pinned_items = set[_K]() self.capacity = capacity self._hits = 0 self._total = 0 - def __contains__(self, key: _K) -> bool: - return key in self.cache - - def __len__(self) -> int: - return len(self.cache) - - def __getitem__(self, key: _K) -> _V: - value = self.cache[key] # Raise KeyError if not exists - self.cache.move_to_end(key) - return value - - def __setitem__(self, key: _K, value: _V) -> None: - self.put(key, value) - def __delitem__(self, key: _K) -> None: - self.pop(key) + run_on_remove = key in self + value = self.__getitem__(key) + super().__delitem__(key) + if key in self.pinned_items: + # Todo: add warning to inform 
that del pinned item + self._unpin(key) + if run_on_remove: + self._on_remove(key, value) + + @property + def cache(self) -> Mapping[_K, _V]: + """Return the internal cache dictionary in order (read-only).""" + return _MappingOrderCacheView( + self._Cache__data, # type: ignore + self.order) + + @property + def order(self) -> Mapping[_K, None]: + """Return the internal order dictionary (read-only).""" + return MappingProxyType(self._LRUCache__order) # type: ignore def stat(self) -> CacheInfo: return CacheInfo(hits=self._hits, total=self._total) def touch(self, key: _K) -> None: - self.cache.move_to_end(key) + self._LRUCache__update(key) # type: ignore - def get(self, key: _K, default: Optional[_V] = None) -> Optional[_V]: - value: Optional[_V] - if key in self.cache: - value = self.cache[key] - self.cache.move_to_end(key) + @overload + def get(self, key: _K, /) -> Optional[_V]: + ... + + @overload + def get(self, key: _K, /, default: Union[_V, _T]) -> Union[_V, _T]: + ... + + def get(self, + key: _K, + /, + default: Optional[Union[_V, + _T]] = None) -> Optional[Union[_V, _T]]: + value: Optional[Union[_V, _T]] + if key in self: + value = self.__getitem__(key) self._hits += 1 else: @@ -265,60 +298,76 @@ class LRUCache(Generic[_K, _V]): self._total += 1 return value + @overload + def pop(self, key: _K) -> _V: + ... + + @overload + def pop(self, key: _K, default: Union[_V, _T]) -> Union[_V, _T]: + ... + + def pop(self, + key: _K, + default: Optional[Union[_V, + _T]] = None) -> Optional[Union[_V, _T]]: + value: Optional[Union[_V, _T]] + if key not in self: + return default + + value = self[key] + del self[key] + return value + def put(self, key: _K, value: _V) -> None: - self.cache[key] = value - self.cache.move_to_end(key) - self._remove_old_if_needed() + self.__setitem__(key, value) def pin(self, key: _K) -> None: """ Pins a key in the cache preventing it from being evicted in the LRU order. 
""" - if key not in self.cache: + if key not in self: raise ValueError(f"Cannot pin key: {key} not in cache.") self.pinned_items.add(key) def _unpin(self, key: _K) -> None: + """ + Unpins a key in the cache allowing it to be + evicted in the LRU order. + """ self.pinned_items.remove(key) def _on_remove(self, key: _K, value: Optional[_V]) -> None: pass def remove_oldest(self, *, remove_pinned: bool = False) -> None: - if not self.cache: + if len(self) == 0: return + self.popitem(remove_pinned=remove_pinned) + + def _remove_old_if_needed(self) -> None: + while self.currsize > self.capacity: + self.remove_oldest() + + def clear(self) -> None: + while len(self) > 0: + self.remove_oldest(remove_pinned=True) + + def popitem(self, remove_pinned: bool = False): + """Remove and return the `(key, value)` pair least recently used.""" if not remove_pinned: # pop the oldest item in the cache that is not pinned lru_key = next( - (key for key in self.cache if key not in self.pinned_items), + (key for key in self.order if key not in self.pinned_items), ALL_PINNED_SENTINEL) if lru_key is ALL_PINNED_SENTINEL: raise RuntimeError("All items are pinned, " "cannot remove oldest from the cache.") else: - lru_key = next(iter(self.cache)) - self.pop(lru_key) # type: ignore - - def _remove_old_if_needed(self) -> None: - while len(self.cache) > self.capacity: - self.remove_oldest() - - def pop(self, key: _K, default: Optional[_V] = None) -> Optional[_V]: - run_on_remove = key in self.cache - value = self.cache.pop(key, default) - # remove from pinned items - if key in self.pinned_items: - self._unpin(key) - if run_on_remove: - self._on_remove(key, value) - return value - - def clear(self) -> None: - while len(self.cache) > 0: - self.remove_oldest(remove_pinned=True) - self.cache.clear() + lru_key = next(iter(self.order)) + value = self.pop(cast(_K, lru_key)) + return (lru_key, value) class PyObjectCache: From 43ed4143c4ec00f4b587c5bcefdb3b6520fbe966 Mon Sep 17 00:00:00 2001 From: Robert Shaw 
<114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Thu, 27 Mar 2025 02:47:25 -0400 Subject: [PATCH 039/593] [Quantization] Fp8 Channelwise Dynamic Per Token GroupedGEMM (#15587) Signed-off-by: ElizaWszola Signed-off-by: ElizaWszola Signed-off-by: rshaw@neuralmagic.com Co-authored-by: ElizaWszola Co-authored-by: Lucas Wilkinson Co-authored-by: ElizaWszola --- vllm/model_executor/layers/fused_moe/layer.py | 26 ----- .../compressed_tensors_moe.py | 105 +++++++++++------- 2 files changed, 66 insertions(+), 65 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 750c5f731c7c6..ef33852e31621 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -885,32 +885,6 @@ class FusedMoE(torch.nn.Module): ] ] - def _load_fp8_scale(self, param: torch.nn.Parameter, - loaded_weight: torch.Tensor, weight_name: str, - shard_id: str, expert_id: int) -> None: - param_data = param.data - - # Input scales can be loaded directly and should be equal. - if "input_scale" in weight_name: - if param_data[expert_id] != 1 and (param_data[expert_id] - - loaded_weight).abs() > 1e-5: - raise ValueError( - "input_scales of w1 and w3 of a layer " - f"must be equal. But got {param_data[expert_id]} " - f"vs. {loaded_weight}") - param_data[expert_id] = loaded_weight - # Weight scales - elif "weight_scale" in weight_name: - # If we are in merged column case (gate_up_proj) - if shard_id in ("w1", "w3"): - # We have to keep the weight scales of w1 and w3 because - # we need to re-quantize w1/w3 weights after weight loading. 
- idx = 0 if shard_id == "w1" else 1 - param_data[expert_id][idx] = loaded_weight - # If we are in the row parallel case (down_proj) - else: - param_data[expert_id] = loaded_weight - def extra_repr(self) -> str: s = ( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 2e14845ff2d6f..bf32bee89e895 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -268,14 +268,23 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod): self.input_quant = self.quant_config.target_scheme_map["Linear"].get( "input_activations") - if not (self.weight_quant.strategy == QuantizationStrategy.TENSOR - and self.input_quant.strategy == QuantizationStrategy.TENSOR): + per_tensor = (self.weight_quant.strategy == QuantizationStrategy.TENSOR + and self.input_quant.strategy + == QuantizationStrategy.TENSOR) + per_channel = ( + self.weight_quant.strategy == QuantizationStrategy.CHANNEL + and self.input_quant.strategy == QuantizationStrategy.TOKEN) + if not (per_tensor or per_channel): raise ValueError( - "For FP8 Fused MoE layers, only per-tensor scales " - "for weights and activations are supported. Found " + "For FP8 Fused MoE layers, we require per tensor " + "or channelwise, dynamic per token quantization. 
Found " f"{self.weight_quant}, {self.input_quant}") self.static_input_scales = not self.input_quant.dynamic + if self.static_input_scales and per_channel: + raise ValueError( + "For FP8 Fused MoE layer, we require either per tensor or " + "channelwise, dynamic per token quantization.") def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -303,24 +312,40 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod): set_weight_attrs(w2_weight, extra_weight_attrs) # WEIGHT_SCALES - # Allocate 2 scales for w1 and w3 respectively. - # They will be combined to a single scale after weight loading. - w13_weight_scale = torch.nn.Parameter(torch.ones(num_experts, - 2, - dtype=torch.float32), - requires_grad=False) - layer.register_parameter("w13_weight_scale", w13_weight_scale) + if self.weight_quant.strategy == QuantizationStrategy.TENSOR: + # Allocate 2 scales for w1 and w3 respectively. + # They are combined to a single scale after weight loading. + w13_weight_scale = torch.nn.Parameter(torch.ones( + num_experts, 2, dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + w2_weight_scale = torch.nn.Parameter(torch.ones( + num_experts, dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + # Add PER-TENSOR quantization for FusedMoE.weight_loader. 
+ extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) - w2_weight_scale = torch.nn.Parameter(torch.ones(num_experts, - dtype=torch.float32), - requires_grad=False) - layer.register_parameter("w2_weight_scale", w2_weight_scale) - # Add the quantization method used (per tensor/grouped/channel) - # to ensure the weight scales are loaded in properly - extra_weight_attrs.update( - {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}) - set_weight_attrs(w13_weight_scale, extra_weight_attrs) - set_weight_attrs(w2_weight_scale, extra_weight_attrs) + elif self.weight_quant.strategy == QuantizationStrategy.CHANNEL: + w13_weight_scale = torch.nn.Parameter(torch.ones( + num_experts, + 2 * intermediate_size_per_partition, + 1, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + w2_weight_scale = torch.nn.Parameter(torch.ones( + num_experts, hidden_size, 1, dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + # Add PER-CHANNEL quantization for FusedMoE.weight_loader. + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) # INPUT_SCALES if self.static_input_scales: @@ -362,6 +387,7 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod): # Fp8 moe kernels require a single activation scale. # We take the max of all the scales in case they differ. 
if self.static_input_scales: + assert self.input_quant.strategy == QuantizationStrategy.TENSOR if (layer.w13_input_scale is None or layer.w2_input_scale is None): raise ValueError( "QuantConfig has static quantization, but found " @@ -377,24 +403,25 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod): layer.w2_input_scale = torch.nn.Parameter( layer.w2_input_scale.max(), requires_grad=False) - # Fp8 moe kernel needs single weight scale for w13 per expert. - # We take the max then dequant and requant each expert. - assert layer.w13_weight_scale is not None - shard_size = layer.intermediate_size_per_partition - max_w13_scales = layer.w13_weight_scale.max(dim=1).values - for expert_id in range(layer.local_num_experts): - start = 0 - for shard_id in range(2): - dq_weight = per_tensor_dequantize( - layer.w13_weight[expert_id][start:start + shard_size, :], - layer.w13_weight_scale[expert_id][shard_id]) - layer.w13_weight[expert_id][ - start:start + shard_size, :], _ = ops.scaled_fp8_quant( - dq_weight, max_w13_scales[expert_id]) - start += shard_size - - layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales, - requires_grad=False) + # For Per-TENSOR case, Fp8 moe kernel needs single weight scale + # for w13 per expert. Use max then dequant and requant each expert. 
+ if self.weight_quant.strategy == QuantizationStrategy.TENSOR: + assert layer.w13_weight_scale is not None + shard_size = layer.intermediate_size_per_partition + max_w13_scales = layer.w13_weight_scale.max(dim=1).values + for expert_id in range(layer.local_num_experts): + start = 0 + for shard_id in range(2): + dq_weight = per_tensor_dequantize( + layer.w13_weight[expert_id][start:start + + shard_size, :], + layer.w13_weight_scale[expert_id][shard_id]) + layer.w13_weight[expert_id][ + start:start + shard_size, :], _ = ops.scaled_fp8_quant( + dq_weight, max_w13_scales[expert_id]) + start += shard_size + layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales, + requires_grad=False) def apply( self, From e6c9053f9ec0b41e9af41def67537a4a3097eeb5 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 27 Mar 2025 15:45:00 +0800 Subject: [PATCH 040/593] [Misc] Clean up `scatter_patch_features` (#15559) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/gemma3_mm.py | 17 ++-- vllm/model_executor/models/internvl.py | 21 ++--- vllm/model_executor/models/llava.py | 22 +++-- vllm/model_executor/models/molmo.py | 105 ++++++++---------------- vllm/model_executor/models/pixtral.py | 18 ++-- vllm/model_executor/models/vision.py | 35 ++++---- 6 files changed, 82 insertions(+), 136 deletions(-) diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 63d3ccbf54bc2..9efb57b8c5aa1 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -30,7 +30,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, # yapf: enable from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from vllm.utils import flatten_2d_lists from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) @@ -60,7 +59,7 @@ class Gemma3ImagePixelInputs(TypedDict): A boolean mask indicating which image 
embeddings correspond to patch tokens. - Shape: `(batch_size, num_images, num_embeds)` + Shape: `(batch_size * num_images, num_embeds)` """ @@ -593,6 +592,7 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, pixel_values = flatten_bn(pixel_values, concat=True) num_crops = flatten_bn(num_crops, concat=True) + embed_is_patch = flatten_bn(embed_is_patch) return Gemma3ImagePixelInputs( type="pixel_values", @@ -635,14 +635,10 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, image_features = self._process_image_input(image_input) - if kwargs.get("v0_path", False): - return image_features - - return flatten_2d_lists( - scatter_patch_features(*args) for args in zip( - image_features, - image_input["embed_is_patch"], - )) + return scatter_patch_features( + image_features, + image_input["embed_is_patch"], + ) def get_input_embeddings( self, @@ -671,7 +667,6 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. 
elif inputs_embeds is None: - kwargs.update({"v0_path": True}) vision_embeddings = self.get_multimodal_embeddings(**kwargs) inputs_embeds = self.get_input_embeddings(input_ids, diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index e1aa371610353..0729f4c7d203c 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -35,7 +35,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils import flatten_2d_lists from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, @@ -66,13 +65,13 @@ class InternVLImagePixelInputs(TypedDict): A boolean mask indicating which image embeddings correspond to patch tokens. 
- Shape: `(batch_size, num_images, num_embeds)` + Shape: `(batch_size * num_images, num_embeds)` """ class InternVLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: NestedTensors + data: Union[torch.Tensor, list[torch.Tensor]] """ A tensor of shape `(num_images, total_image_feature_size, hidden_size)` or a list of tensors of shape `(total_image_feature_size, hidden_size)` @@ -867,6 +866,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): pixel_values_flat = flatten_bn(pixel_values_flat, concat=True) image_num_patches = flatten_bn(image_num_patches, concat=True) + embed_is_patch = flatten_bn(embed_is_patch) return InternVLImagePixelInputs( type="pixel_values", @@ -881,7 +881,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): def _process_image_input( self, image_input: InternVLImageInputs, - ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: + ) -> Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor, ...]]: if image_input["type"] == "image_embeds": return image_input["data"] @@ -921,15 +921,13 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): image_features = self._process_image_input(image_input) - if (kwargs.get("v0_path", False) - or image_input["type"] != "pixel_values"): + if image_input["type"] != "pixel_values": return image_features - return flatten_2d_lists( - scatter_patch_features(*args) for args in zip( - image_features, - image_input["embed_is_patch"], - )) + return scatter_patch_features( + image_features, + image_input["embed_is_patch"], + ) def get_input_embeddings( self, @@ -964,7 +962,6 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. 
elif inputs_embeds is None: - kwargs.update({"v0_path": True}) vision_embeddings = self.get_multimodal_embeddings(**kwargs) inputs_embeds = self.get_input_embeddings(input_ids, vision_embeddings) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index d1014067d9d7c..826f04b37547b 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -35,7 +35,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptReplacement, PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from vllm.utils import flatten_2d_lists from .clip import CLIPVisionModel from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP @@ -73,7 +72,7 @@ class PixtralHFImagePixelInputs(TypedDict): A boolean mask indicating which image embeddings correspond to patch tokens. - Shape: `(batch_size, num_images, num_embeds)` + Shape: `(batch_size * num_images, num_embeds)` """ @@ -618,6 +617,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): raise ValueError("Incorrect type of embed_is_patch. 
" f"Got type: {type(embed_is_patch)}") + embed_is_patch = flatten_bn(embed_is_patch) + return PixtralHFImagePixelInputs( type="pixel_values_pixtral", pixel_values=flatten_bn(pixel_values), @@ -713,18 +714,16 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): if image_input is None: return None - vision_embeddings = self._process_image_input(image_input) + image_features = self._process_image_input(image_input) - if (kwargs.get("v0_path", False) - or image_input["type"] != "pixel_values_pixtral"): + if image_input["type"] != "pixel_values_pixtral": # The path is used for pixtral (V0 only) and llava (V0/V1) - return vision_embeddings + return image_features - return flatten_2d_lists( - scatter_patch_features(*args) for args in zip( - vision_embeddings, - image_input["embed_is_patch"], - )) + return scatter_patch_features( + image_features, + image_input["embed_is_patch"], + ) def get_input_embeddings( self, @@ -790,7 +789,6 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. 
elif inputs_embeds is None: - kwargs.update({"v0_path": True}) vision_embeddings = self.get_multimodal_embeddings(**kwargs) inputs_embeds = self.get_input_embeddings(input_ids, vision_embeddings) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 146d48e522119..9224687d8a5d3 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -49,7 +49,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptInsertion, PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from vllm.utils import flatten_2d_lists from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP, SupportsQuant) @@ -72,17 +71,17 @@ POOLING_SIZE = 2 class MolmoImageInputs(TypedDict): images: Union[torch.Tensor, list[torch.Tensor]] - """Shape: `(batch_size, num_crops, num_patch, patch_dim)`""" + """Shape: `(batch_size * num_images, num_crops, num_patch, patch_dim)`""" image_masks: Optional[Union[torch.Tensor, list[torch.Tensor]]] - """Shape: `(batch_size, num_crops, num_patch)`""" + """Shape: `(batch_size * num_images, num_crops, num_patch)`""" feat_is_patch: Union[torch.Tensor, list[torch.Tensor]] """ A boolean mask indicating which image features correspond to patch tokens. - Shape: `(batch_size, num_crops, num_patch)` + Shape: `(batch_size * num_images, num_crops, num_patch)` """ embed_is_patch: Union[torch.Tensor, list[torch.Tensor]] @@ -90,7 +89,7 @@ class MolmoImageInputs(TypedDict): A boolean mask indicating which image embeddings correspond to patch tokens. 
- Shape: `(batch_size, num_embeds)` + Shape: `(batch_size * num_images, num_embeds)` """ num_crops: Union[torch.Tensor, list[torch.Tensor]] @@ -696,9 +695,10 @@ class MolmoVisionBackbone(nn.Module, SupportsQuant): return image_features def forward( - self, images: torch.Tensor, image_masks: torch.Tensor - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - + self, + images: torch.Tensor, + image_masks: torch.Tensor, + ) -> torch.Tensor: # image_features: (batch_size, num_crops(=num_image), num_patch, nximage_emb_dim) # noqa: E501 batch_size, num_image = images.shape[:2] images = images.to(device=self.device, dtype=self.dtype) @@ -1491,6 +1491,8 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, f"Got type: {type(img_patch_id)}") self.img_patch_id = img_patch_id.flatten().unique().item() + embed_is_patch = flatten_bn(embed_is_patch) + return MolmoImageInputs( images=images, image_masks=image_masks, @@ -1502,13 +1504,17 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, def _process_image_input( self, image_input: MolmoImageInputs, - ) -> Union[torch.Tensor, list[torch.Tensor]]: - if isinstance(image_input["images"], list): + ) -> list[torch.Tensor]: + images = image_input["images"] + image_masks = image_input["image_masks"] + feat_is_patch = image_input["feat_is_patch"] + num_crops = image_input["num_crops"] + + if isinstance(images, list): # Call the vision backbone on the whole batch at once - images_flat = flatten_bn(image_input["images"], concat=True) - image_masks_flat = (None if (image_masks := - image_input["image_masks"]) is None - else flatten_bn(image_masks, concat=True)) + images_flat = flatten_bn(images, concat=True) + image_masks_flat = (None if image_masks is None else flatten_bn( + image_masks, concat=True)) image_features_flat = self.vision_backbone( images=images_flat.unsqueeze(0), @@ -1517,63 +1523,19 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, 
).squeeze(0) # Reconstruct the batch dimension - image_features = image_features_flat.split( - image_input["num_crops"].sum(-1).tolist()) + num_crops_per_image = [nc.sum().item() for nc in num_crops] + image_features = image_features_flat.split(num_crops_per_image) else: image_features = self.vision_backbone( - images=image_input["images"], - image_masks=image_input["image_masks"], + images=images, + image_masks=image_masks, ) - return image_features - - def _get_mm_embeds( - self, - features: torch.Tensor, # Shape: (num_crop, num_patch, d) - feat_is_patch: torch.Tensor, # Shape: (num_crop, num_patch) - num_crops: torch.Tensor, # Shape: (num_images,) - embed_is_patch: torch.Tensor, # Shape: (num_embeds,) - ) -> tuple[torch.Tensor, ...]: - """ - Scatter the patch features into a contiguous tensor that corresponds - to the embedding tokens defined by the multimodal processor. - - Note: - The original code only considers patch tokens as feature - tokens, but our processor considers all image-related tokens - as feature tokens because the feature tokens need to be - consecutive in `input_ids`. - - Example: - A simplified example for one item in the batch: - - .. 
code-block:: - - Embedding tokens (from HF processor): - [ ] - - embed_is_patch (from HF processor): - [ False True True False True True False False ] - - Encoder outputs (from model): - [ p1 p2 0 p3 p4 0 ] - - feat_is_patch (from HF processor): - [ True True False True True False ] - - The resulting embedding tensor is: - [ nan p1 p2 nan p3 p4 nan nan ] - """ - num_crops_per_image = num_crops.tolist() - feats_per_image = features.split(num_crops_per_image) - f_is_patch_per_image = feat_is_patch.split(num_crops_per_image) - - features = torch.cat([ + # Only the features corresponding to patch tokens are relevant + return [ feats[f_is_patch] - for feats, f_is_patch in zip(feats_per_image, f_is_patch_per_image) - ]) - - return scatter_patch_features(features, embed_is_patch) + for feats, f_is_patch in zip(image_features, feat_is_patch) + ] def get_multimodal_embeddings( self, **kwargs: object) -> Optional[MultiModalEmbeddings]: @@ -1583,13 +1545,10 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, image_features = self._process_image_input(image_input) - return flatten_2d_lists( - self._get_mm_embeds(*args) for args in zip( - image_features, - image_input["feat_is_patch"], - image_input["num_crops"], - image_input["embed_is_patch"], - )) + return scatter_patch_features( + image_features, + image_input["embed_is_patch"], + ) def get_input_embeddings( self, diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index a3ad360961243..da2017c987d4f 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -42,7 +42,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.tokenizer import (MistralTokenizer, cached_tokenizer_from_config) -from vllm.utils import flatten_2d_lists from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .utils 
import (flatten_bn, init_vllm_registered_model, maybe_prefix, @@ -74,7 +73,7 @@ class PixtralImagePixelInputs(TypedDict): A boolean mask indicating which image embeddings correspond to patch tokens. - Shape: `(batch_size, num_images, num_embeds)` + Shape: `(batch_size * num_images, num_embeds)` """ @@ -387,6 +386,8 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, raise ValueError("Incorrect type of embed_is_patch. " f"Got type: {type(embed_is_patch)}") + embed_is_patch = flatten_bn(embed_is_patch) + return PixtralImagePixelInputs( type="pixel_values", images=flatten_bn(images), @@ -428,14 +429,10 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, image_features = self._process_image_input(image_input) - if kwargs.get("v0_path", False): - return image_features - - return flatten_2d_lists( - scatter_patch_features(*args) for args in zip( - image_features, - image_input["embed_is_patch"], - )) + return scatter_patch_features( + image_features, + image_input["embed_is_patch"], + ) def get_input_embeddings( self, @@ -467,7 +464,6 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. 
elif inputs_embeds is None: - kwargs.update({"v0_path": True}) vision_embeddings = self.get_multimodal_embeddings(**kwargs) inputs_embeds = self.get_input_embeddings(input_ids, vision_embeddings) diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index c91459398308e..db069f8de2a35 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod +from collections.abc import Sequence from typing import Final, Generic, Optional, Protocol, TypeVar, Union, cast import torch @@ -154,8 +155,8 @@ def resolve_visual_encoder_outputs( def scatter_patch_features( - features: torch.Tensor, - embed_is_patch: Union[torch.Tensor, list[torch.Tensor]], + patches: Union[torch.Tensor, Sequence[torch.Tensor]], + embed_is_patch: Union[torch.Tensor, Sequence[torch.Tensor]], ) -> tuple[torch.Tensor, ...]: """ Scatter the patch features into a contiguous tensor that corresponds @@ -165,8 +166,8 @@ def scatter_patch_features( can be filtered out by :func`select_patch_features`. Args: - features: The patch features, concatenated across each image. - Shape: `(num_patch, feature_depth)` + patches: The patch features for each image. + Shape: `(num_images, , feature_depth)` embed_is_patch: A boolean mask indicating which image embeddings correspond to patch tokens for each image. Shape: `(num_images, num_embeds)` @@ -194,21 +195,21 @@ def scatter_patch_features( The resulting embedding tensor is: [ nan p1 p2 nan p3 p4 nan nan ] """ - num_embeds_per_image = [ - e_is_patch.numel() for e_is_patch in embed_is_patch - ] - if isinstance(embed_is_patch, torch.Tensor): - embed_is_patch_flat = embed_is_patch.view(-1) - else: - embed_is_patch_flat = torch.cat(embed_is_patch) + if len(patches) != len(embed_is_patch): + raise ValueError(f"Inconsistent num_images: {len(patches)=} vs. 
" + f"{len(embed_is_patch)=}") - embeds_flat = features.new_full( - (sum(num_embeds_per_image), features.shape[-1]), - fill_value=torch.nan, - ) - embeds_flat[embed_is_patch_flat] = features.flatten(0, -2) + def get_embed_one(patches_one: torch.Tensor, e_is_patch: torch.Tensor): + embed_one = patches_one.new_full( + (e_is_patch.shape[0], patches_one.shape[-1]), + fill_value=torch.nan, + ) + embed_one[e_is_patch] = patches_one.flatten(0, -2) + return embed_one - return embeds_flat.split(num_embeds_per_image) + return tuple( + get_embed_one(patches_one, e_is_patch) + for patches_one, e_is_patch in zip(patches, embed_is_patch)) def select_patch_features( From 3f532cb6a69e51a6578b85642fcba34ac348f7a4 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 27 Mar 2025 17:21:23 +0800 Subject: [PATCH 041/593] [Misc] Use model_redirect to redirect the model name to a local folder. (#14116) --- vllm/config.py | 10 ++++++--- vllm/envs.py | 5 +++++ vllm/transformers_utils/utils.py | 38 ++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 62800afc3e699..687c8b56ec126 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -38,7 +38,7 @@ from vllm.transformers_utils.config import ( get_sentence_transformer_tokenizer_config, is_encoder_decoder, try_get_generation_config, uses_mrope) from vllm.transformers_utils.s3_utils import S3Model -from vllm.transformers_utils.utils import is_s3 +from vllm.transformers_utils.utils import is_s3, maybe_model_redirect from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless, get_cpu_memory, random_uuid, resolve_obj_by_qualname) @@ -266,9 +266,13 @@ class ModelConfig: override_generation_config: Optional[dict[str, Any]] = None, model_impl: Union[str, ModelImpl] = ModelImpl.AUTO, ) -> None: - self.model = model + self.model = maybe_model_redirect(model) + self.tokenizer = maybe_model_redirect(tokenizer) + self.hf_config_path = hf_config_path - 
self.tokenizer = tokenizer + if isinstance(hf_config_path, str): + self.hf_config_path = maybe_model_redirect(hf_config_path) + self.tokenizer_mode = tokenizer_mode self.trust_remote_code = trust_remote_code self.allowed_local_media_path = allowed_local_media_path diff --git a/vllm/envs.py b/vllm/envs.py index e16753191c6e2..23c304f124d36 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -22,6 +22,7 @@ if TYPE_CHECKING: S3_ACCESS_KEY_ID: Optional[str] = None S3_SECRET_ACCESS_KEY: Optional[str] = None S3_ENDPOINT_URL: Optional[str] = None + VLLM_MODEL_REDIRECT_PATH: Optional[str] = None VLLM_CACHE_ROOT: str = os.path.expanduser("~/.cache/vllm") VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm") VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai" @@ -635,6 +636,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_CI_USE_S3": lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1", + # Use model_redirect to redirect the model name to a local folder. + "VLLM_MODEL_REDIRECT_PATH": + lambda: os.environ.get("VLLM_MODEL_REDIRECT_PATH", None), + # Whether to use atomicAdd reduce in gptq/awq marlin kernel. 
"VLLM_MARLIN_USE_ATOMIC_ADD": lambda: os.environ.get("VLLM_MARLIN_USE_ATOMIC_ADD", "0") == "1", diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py index 87e446f894384..bae487b75588e 100644 --- a/vllm/transformers_utils/utils.py +++ b/vllm/transformers_utils/utils.py @@ -1,9 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 +from functools import cache from os import PathLike from pathlib import Path from typing import List, Optional, Union +from vllm.envs import VLLM_MODEL_REDIRECT_PATH +from vllm.logger import init_logger + +logger = init_logger(__name__) + def is_s3(model_or_path: str) -> bool: return model_or_path.lower().startswith('s3://') @@ -38,3 +44,35 @@ def modelscope_list_repo_files( if file['Type'] == 'blob' ] return files + + +@cache +def maybe_model_redirect(model: str) -> str: + """ + Use model_redirect to redirect the model name to a local folder. + + :param model: hf model name + :return: maybe redirect to a local folder + """ + + model_redirect_path = VLLM_MODEL_REDIRECT_PATH + + if not model_redirect_path: + return model + + if not Path(model_redirect_path).exists(): + return model + + with open(model_redirect_path) as f: + for line in f.readlines(): + try: + model_name, redirect_name = line.split("\t") + if model == model_name: + redirect_name = redirect_name.strip() + logger.info("model redirect: [ %s ] -> [ %s ]", model, + redirect_name) + return redirect_name + except Exception: + pass + + return model From 6278bc829eb6214f3375cc50347d58dbae81bc31 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Thu, 27 Mar 2025 06:33:41 -0400 Subject: [PATCH 042/593] Fix incorrect filenames in vllm_compile_cache.py (#15494) Signed-off-by: Signed-off-by: youkaichao Co-authored-by: youkaichao --- vllm/compilation/compiler_interface.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 571e2b832e95f..ab0f98bdaa3e5 
100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -229,7 +229,20 @@ class InductorAdaptor(CompilerInterface): inductor_compiled_graph = output if inductor_compiled_graph is not None: nonlocal file_path - file_path = inductor_compiled_graph.current_callable.__code__.co_filename # noqa + compiled_fn = inductor_compiled_graph.current_callable + file_path = compiled_fn.__code__.co_filename # noqa + if not file_path.startswith(self.cache_dir): + # hooked in the align_inputs_from_check_idxs function + # in torch/_inductor/utils.py + for cell in compiled_fn.__closure__: + if not callable(cell.cell_contents): + continue + code = cell.cell_contents.__code__ + if code.co_filename.startswith(self.cache_dir): + # this is the real file path + # compiled from Inductor + file_path = code.co_filename + break hash_str = inductor_compiled_graph._fx_graph_cache_key return output From 8063dfc61a0cbb348d4b1baf4b6e03e8ebae7cfa Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Thu, 27 Mar 2025 20:38:46 +0800 Subject: [PATCH 043/593] [Doc] update --system for transformers installation in docker doc (#15616) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- docs/source/deployment/docker.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md index 1f60faf40879e..65cb038de1b4e 100644 --- a/docs/source/deployment/docker.md +++ b/docs/source/deployment/docker.md @@ -34,11 +34,11 @@ If you need to use those dependencies (having accepted the license terms), create a custom Dockerfile on top of the base image with an extra layer that installs them: ```Dockerfile -FROM vllm/vllm-openai:v0.8.0 +FROM vllm/vllm-openai:v0.8.2 # e.g. install the `audio` and `video` optional dependencies # NOTE: Make sure the version of vLLM matches the base image! 
-RUN uv pip install vllm[audio,video]==0.8.0 +RUN uv pip install --system vllm[audio,video]==0.8.2 ``` ::: @@ -52,7 +52,7 @@ with an extra layer that installs their code from source: ```Dockerfile FROM vllm/vllm-openai:latest -RUN uv pip install git+https://github.com/huggingface/transformers.git +RUN uv pip install --system git+https://github.com/huggingface/transformers.git ``` ::: From ac5bc615b0adac4038e5574b446c8ac64c241caf Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 27 Mar 2025 21:07:29 +0800 Subject: [PATCH 044/593] [Model] MiniCPM-V/O supports V1 (#15487) Signed-off-by: DarkLight1337 --- docs/source/models/supported_models.md | 4 +- vllm/model_executor/models/minicpmo.py | 427 +++++++-------- vllm/model_executor/models/minicpmv.py | 696 ++++++++++++------------- vllm/model_executor/models/molmo.py | 40 +- 4 files changed, 573 insertions(+), 594 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 8ff18a17d36c3..793831fd06ded 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -836,14 +836,14 @@ See [this page](#generative-models) for more information on how to use generativ * `openbmb/MiniCPM-o-2_6`, etc. * ✅︎ * ✅︎ - * + * ✅︎ - * `MiniCPMV` * MiniCPM-V * T + IE+ + VE+ * `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. * ✅︎ * ✅︎ - * + * ✅︎ - * `MllamaForConditionalGeneration` * Llama 3.2 * T + I+ diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index 1312b1051732f..ea37de0b806ab 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -23,8 +23,8 @@ # limitations under the License. 
"""Inference-only MiniCPM-O model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping, Sequence -from typing import (Any, Callable, Dict, Literal, Optional, Set, Tuple, - TypedDict, Union) +from typing import (Any, Callable, Literal, Optional, Set, Tuple, TypedDict, + Union) import torch from torch import nn @@ -42,8 +42,6 @@ from vllm.multimodal.parse import (AudioItem, AudioProcessorItems, MultiModalDataParser) from vllm.multimodal.processing import PromptReplacement, PromptUpdate from vllm.multimodal.profiling import ProcessorInputs -from vllm.sequence import IntermediateTensors -from vllm.utils import flatten_2d_lists from .minicpmv import (MiniCPMV2_6, MiniCPMVDummyInputsBuilder, MiniCPMVMultiModalDataParser, @@ -51,13 +49,14 @@ from .minicpmv import (MiniCPMV2_6, MiniCPMVDummyInputsBuilder, _minicpmv_field_config) from .utils import (AutoWeightsLoader, cast_overflow_tensors, flatten_bn, maybe_prefix) +from .vision import scatter_patch_features CPU_DEVICE = torch.device("cpu") class MiniCPMOAudioFeatureInputs(TypedDict): type: Literal["audio_features"] - audio_features: torch.Tensor + audio_features: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size * num_audios * num_slices, num_channels, length)` Slice here means chunk. Audio that is too long will be split into slices, @@ -65,37 +64,40 @@ class MiniCPMOAudioFeatureInputs(TypedDict): Padding is used therefore `audio_features` is `torch.Tensor`. """ - audio_feature_lens: torch.Tensor + audio_feature_lens: Union[torch.Tensor, list[torch.Tensor]] """ - Shape: `(batch_size * num_audios * num_slices)` + Shape: `(batch_size * num_audios, num_slices)` This should be feature length of each audio slice, which equals to `audio_features.shape[-1]` """ - audio_bounds: torch.Tensor + embed_is_patch: Union[torch.Tensor, list[torch.Tensor]] """ - Shape: `(batch_size * num_audios * num_slices, 2)` + A boolean mask indicating which audio embeddings correspond + to patch tokens. 
- This should be in `(start, stop)` format. + Shape: `(batch_size * num_audios, num_embeds)` """ class MiniCPMOAudioEmbeddingInputs(TypedDict): type: Literal["audio_embeds"] - audio_embeds: torch.Tensor + audio_embeds: Union[torch.Tensor, list[torch.Tensor]] """ - Shape: `(batch_size * num_images * num_slices, hidden_size)` + Shape: `(batch_size * num_audios, num_slices, hidden_size)` `hidden_size` must match the hidden size of language model backbone. instead of a batched tensor. Length of each slice may vary, so pass it as a list. """ - audio_bounds: torch.Tensor - """ - Shape: `(batch_size * num_audios * num_slices, 2)` - This should be in `(start, stop)` format. + embed_is_patch: Union[torch.Tensor, list[torch.Tensor]] + """ + A boolean mask indicating which audio embeddings correspond + to patch tokens. + + Shape: `(batch_size * num_audios, num_embeds)` """ @@ -104,11 +106,16 @@ MiniCPMOAudioInputs = Union[MiniCPMOAudioFeatureInputs, def _minicpmo_field_config(hf_inputs: Mapping[str, torch.Tensor]): + audio_features = hf_inputs.get("audio_features", torch.empty(0)) + num_audios = len(audio_features) + return dict( **_minicpmv_field_config(hf_inputs), audio_features=MultiModalFieldConfig.batched("audio"), audio_feature_lens=MultiModalFieldConfig.batched("audio"), audio_embeds=MultiModalFieldConfig.batched("audio"), + audio_embed_is_patch=MultiModalFieldConfig.batched("audio"), + audio_token_id=MultiModalFieldConfig.shared("audio", num_audios), ) @@ -149,7 +156,7 @@ class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo): audio_pattern = "()" def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None, "video": None, "audio": None} + return {**super().get_supported_mm_limits(), "audio": None} def get_mm_max_tokens_per_item( self, @@ -157,11 +164,25 @@ class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo): mm_counts: Mapping[str, int], ) -> Mapping[str, int]: return { - "image": self.get_max_image_tokens(), - "audio": 
self.get_max_audio_tokens(), - "video": self.get_max_video_tokens(seq_len), + **super().get_mm_max_tokens_per_item(seq_len, mm_counts), + "audio": + self.get_max_audio_tokens(), } + def get_audio_placeholder( + self, + audio_lens: int, + chunk_input: bool = True, + chunk_length: int = 1, + ) -> str: + hf_processor = self.get_hf_processor() + + return hf_processor.get_audio_placeholder( + audio_lens, + chunk_input=chunk_input, + chunk_length=chunk_length, + ) + def get_default_audio_pool_step(self) -> int: return 2 @@ -197,12 +218,8 @@ class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo): max_videos = mm_config.get_limit_per_prompt("video") max_audios = mm_config.get_limit_per_prompt("audio") - # count tokens - # which are not in get_max_image_tokens - max_image_tokens = self.get_max_image_tokens( - ) * max_images + 4 * max_images - max_audio_tokens = self.get_max_audio_tokens( - ) * max_audios + 2 * max_audios + max_image_tokens = self.get_max_image_tokens() * max_images + max_audio_tokens = self.get_max_audio_tokens() * max_audios max_total_frames = self.get_max_video_frames(seq_len - max_image_tokens - max_audio_tokens) @@ -224,20 +241,20 @@ class MiniCPMODummyInputsBuilder( processor_inputs = super().get_dummy_processor_inputs( seq_len, mm_counts) - mm_data = { - "image": - processor_inputs.mm_data["image"], - "video": - processor_inputs.mm_data["video"], + + audio_prompt_texts = self.info.audio_pattern * num_audios + audio_mm_data = { "audio": self._get_dummy_audios(length=audio_len, num_audios=num_audios) } - audio_prompt_texts = self.info.audio_pattern * num_audios - - return ProcessorInputs(prompt_text=processor_inputs.prompt_text + \ - audio_prompt_texts, - mm_data=mm_data) + return ProcessorInputs( + prompt_text=processor_inputs.prompt_text + audio_prompt_texts, + mm_data={ + **processor_inputs.mm_data, + **audio_mm_data, + }, + ) class MiniCPMOMultiModalProcessor( @@ -247,22 +264,17 @@ class MiniCPMOMultiModalProcessor( return 
MiniCPMOMultiModalDataParser( target_sr=self.info.get_default_audio_sampling_rate()) - def get_audio_prompt_texts(self, - audio_lens: int, - chunk_input: bool = True, - chunk_length: int = 1) -> str: - return self.info.get_hf_processor().get_audio_placeholder( - audio_lens, chunk_input, chunk_length) - - def get_special_tokens(self) -> Dict[str, torch.Tensor]: - tokenizer = self.info.get_tokenizer() - special_tokens = super().get_special_tokens() - if hasattr(tokenizer, "audio_start_id"): - special_tokens["audio_start_id"] = torch.tensor( - tokenizer.audio_start_id) - special_tokens["audio_end_id"] = torch.tensor( - tokenizer.audio_end_id) - return special_tokens + def get_audio_prompt_texts( + self, + audio_lens: int, + chunk_input: bool = True, + chunk_length: int = 1, + ) -> str: + return self.info.get_audio_placeholder( + audio_lens, + chunk_input=chunk_input, + chunk_length=chunk_length, + ) def process_audios( self, @@ -274,32 +286,65 @@ class MiniCPMOMultiModalProcessor( parsed_audios = (self._get_data_parser().parse_mm_data({ "audio": audios - }).get_items("audio", AudioProcessorItems)) + }).get_items("audio", + (MiniCPMOAudioEmbeddingItems, AudioProcessorItems))) - audio_inputs = self._base_call_hf_processor( - prompts=[self.info.audio_pattern] * len(parsed_audios), - mm_data={"audios": [[audio] for audio in parsed_audios]}, - mm_kwargs={ - **mm_kwargs, "chunk_input": True - }, - out_keys={"audio_features", "audio_feature_lens"}, - ) + if isinstance(parsed_audios, MiniCPMOAudioEmbeddingItems): + audio_inputs = {} - # Avoid padding since we need the output for each audio to be - # independent of other audios for the cache to work correctly - unpadded_audio_features = [ - feat[:, :feature_len] for feat, feature_len in zip( - audio_inputs["audio_features"], - audio_inputs["audio_feature_lens"], + audio_lens = [ + self.info.get_audio_len_by_num_chunks( + sum(map(len, + parsed_audios.get(i)["audio_embeds"]))) + for i in range(len(parsed_audios)) + ] + else: + 
audio_inputs = self._base_call_hf_processor( + prompts=[self.info.audio_pattern] * len(parsed_audios), + mm_data={"audios": [[audio] for audio in parsed_audios]}, + mm_kwargs={ + **mm_kwargs, + "chunk_input": True, + }, + out_keys={"audio_features", "audio_feature_lens"}, ) + + # Avoid padding since we need the output for each audio to be + # independent of other audios for the cache to work correctly + unpadded_audio_features = [ + feat[:, :feature_len] for feat, feature_len in zip( + audio_inputs["audio_features"], + audio_inputs["audio_feature_lens"], + ) + ] + audio_inputs["audio_features"] = unpadded_audio_features + + audio_lens = [ + parsed_audios.get_audio_length(i) + for i in range(len(parsed_audios)) + ] + + audio_repl_features = [ + self.get_audio_prompt_texts(audio_len) for audio_len in audio_lens ] - audio_inputs["audio_features"] = unpadded_audio_features + + tokenizer = self.info.get_tokenizer() + audio_repls_feature_tokens = [ + tokenizer.encode(audio_repl, add_special_tokens=False) + for audio_repl in audio_repl_features + ] + + embed_is_patch = [ + self.get_embed_is_patch(audio_repl_tokens) + for audio_repl_tokens in audio_repls_feature_tokens + ] + audio_inputs["audio_embed_is_patch"] = embed_is_patch + + unk_token_id = tokenizer.get_vocab()[""] + audio_inputs["audio_token_id"] = torch.tensor(unk_token_id) return audio_inputs - def get_placeholder_match_pattern(self) -> str: - return r"\(<(image|video|audio)>./\)" - def process_mm_inputs( self, mm_data: Mapping[str, object], @@ -331,8 +376,7 @@ class MiniCPMOMultiModalProcessor( if isinstance(audios, MiniCPMOAudioEmbeddingItems): single_audio_embeds = audios.get(item_idx)["audio_embeds"] audio_len = self.info.get_audio_len_by_num_chunks( - sum(chunk_embeds.shape[0] - for chunk_embeds in single_audio_embeds)) + sum(map(len, single_audio_embeds))) else: audio_len = audios.get_audio_length(item_idx) @@ -514,6 +558,8 @@ class MiniCPMO(MiniCPMV2_6): self.apm = 
self.init_audio_module(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "apm")) + self.audio_token_id = None + def init_audio_module(self, *, vllm_config: VllmConfig, prefix: str = ""): # Do not use parameters temporarily audio_config = self.config.audio_config @@ -563,18 +609,30 @@ class MiniCPMO(MiniCPMV2_6): return input_lengths_after_cnn, input_lengths_after_pooling - # Copied from HF repo of MiniCPM-o-2_6, - # designed for batched inputs and outputs - def get_audio_hidden_states(self, data: MiniCPMOAudioInputs, - chunk_length: int) -> list[torch.Tensor]: - wavforms = data.get( - "audio_features", - []) # (bs, 80, frames) or [], multi audios need filled in advance - audio_feature_lens_raw = [data.get("audio_feature_lens", - [])] # list, [[x1, x2], [y1], [z1]] + def get_audio_hidden_states( + self, data: MiniCPMOAudioFeatureInputs) -> list[torch.Tensor]: + chunk_length = self.config.audio_chunk_length - if len(wavforms) == 0: - return [] + # (bs, 80, frames) or [], multi audios need filled in advance + wavforms_raw = data["audio_features"] + if isinstance(wavforms_raw, list): + B = len(wavforms_raw) + C = wavforms_raw[0].shape[-2] + L = max(item.shape[-1] for item in wavforms_raw) + device = wavforms_raw[0].device + dtype = wavforms_raw[0].dtype + + wavforms = torch.zeros((B, C, L), dtype=dtype, device=device) + for i, wavforms_item in enumerate(wavforms_raw): + L_item = wavforms_item.shape[-1] + wavforms[i, ..., :L_item] = wavforms_item + else: + wavforms = wavforms_raw + + # list, [[x1, x2], [y1], [z1]] + audio_feature_lens_raw = data["audio_feature_lens"] + if isinstance(audio_feature_lens_raw, torch.Tensor): + audio_feature_lens_raw = audio_feature_lens_raw.unbind(0) audio_feature_lens = torch.hstack(audio_feature_lens_raw) batch_size, _, max_mel_seq_len = wavforms.shape @@ -625,159 +683,104 @@ class MiniCPMO(MiniCPMV2_6): num_audio_tokens = feature_lens_after_pooling - final_audio_embeds = [] + final_audio_embeds = list[torch.Tensor]() idx = 0 for i in 
range(len(audio_feature_lens_raw)): - target_audio_embeds = [] + target_audio_embeds_lst = list[torch.Tensor]() for _ in range(len(audio_feature_lens_raw[i])): - target_audio_embeds.append( + target_audio_embeds_lst.append( audio_embeds[idx, :num_audio_tokens[idx], :]) idx += 1 - final_audio_embeds.append(target_audio_embeds) + + final_audio_embeds.append(torch.cat(target_audio_embeds_lst)) + return final_audio_embeds - def get_embedding_with_audios(self, vlm_embedding: torch.Tensor, - audio_inputs: MiniCPMOAudioInputs, - chunk_length: int) -> torch.Tensor: - device, dtype = vlm_embedding.device, vlm_embedding.dtype - if audio_inputs["type"] == "audio_embeds": - audio_embeddings = [ - item.to(device=device, dtype=dtype) - for item in audio_inputs["audio_embeds"] - ] - else: - audio_embeddings = self.get_audio_hidden_states( - audio_inputs, chunk_length)[0] - if audio_embeddings is None or len(audio_embeddings) == 0: - return vlm_embedding - audio_bounds = audio_inputs["audio_bounds"] - if self.config.chunk_input: - audio_embs = torch.cat(audio_embeddings, dim=0).to(device=device, - dtype=dtype) - audio_start_pos = 0 - for bound in audio_bounds: - audio_len = bound[1] - bound[0] - vlm_embedding[bound[0]:bound[1]] = audio_embs[ - audio_start_pos:audio_start_pos + audio_len, :] - audio_start_pos += audio_len - else: - for embs, bound in zip(audio_embeddings, audio_bounds): - audio_indices = torch.arange(bound[0], - bound[1], - dtype=torch.long).to(device) - - if embs.shape[0] != len(audio_indices): - raise ValueError( - "Shape mismatch: Trying to assign embeddings " - f"of shape {embs.shape} " - f"to input indices of length {len(audio_indices)}") - vlm_embedding[audio_indices] = embs.to(dtype) - return vlm_embedding - - def _get_audio_bounds(self, input_ids: torch.Tensor, - audio_start_id: torch.Tensor, - audio_end_id: torch.Tensor) -> torch.Tensor: - audio_start_tokens, = torch.where(input_ids == audio_start_id[0]) - audio_start_tokens += 1 - audio_end_tokens, = 
torch.where(input_ids == audio_end_id[0]) - valid_audio_nums = max(len(audio_start_tokens), len(audio_end_tokens)) - return torch.hstack([ - audio_start_tokens[:valid_audio_nums].unsqueeze(-1), - audio_end_tokens[:valid_audio_nums].unsqueeze(-1) - ]) - - def _parse_and_validate_audio_inputs( - self, input_ids: torch.Tensor, - **kwargs: object) -> Optional[MiniCPMOAudioInputs]: + def _parse_and_validate_audio_input( + self, **kwargs: object) -> Optional[MiniCPMOAudioInputs]: audio_features = kwargs.pop("audio_features", None) audio_embeds = kwargs.pop("audio_embeds", None) if audio_features is None and audio_embeds is None: return None - audio_start_id = kwargs.pop("audio_start_id") - if not isinstance(audio_start_id, torch.Tensor): - raise ValueError("Incorrect type of audio_start_id. " - f"Got type: {type(audio_start_id)}") + audio_token_id = kwargs.pop("audio_token_id") + if audio_token_id is not None: + assert isinstance(audio_token_id, torch.Tensor) + self.mm_token_ids.add(audio_token_id.flatten().unique().item()) - audio_end_id = kwargs.pop("audio_end_id") - if not isinstance(audio_end_id, torch.Tensor): - raise ValueError("Incorrect type of audio_end_id. " - f"Got type: {type(audio_end_id)}") + audio_embed_is_patch = kwargs.pop("audio_embed_is_patch") + if not isinstance(audio_embed_is_patch, (torch.Tensor, list)): + raise ValueError("Incorrect type of audio_embed_is_patch. " + f"Got type: {type(audio_embed_is_patch)}") + + audio_embed_is_patch = flatten_bn(audio_embed_is_patch) if audio_embeds is not None: if not isinstance(audio_embeds, (torch.Tensor, list)): raise ValueError("Incorrect type of audio_embeds. 
" f"Got type: {type(audio_embeds)}") + audio_embeds_flat = flatten_bn(audio_embeds) + return MiniCPMOAudioEmbeddingInputs( type="audio_embeds", - audio_embeds=flatten_bn(flatten_2d_lists(audio_embeds), - concat=True), - audio_bounds=self._get_audio_bounds(input_ids, audio_start_id, - audio_end_id), + audio_embeds=audio_embeds_flat, + embed_is_patch=audio_embed_is_patch, ) - if audio_features is not None: - if not isinstance(audio_features, (torch.Tensor, list)): - raise ValueError("Incorrect type of audio_features. " - f"Got type: {type(audio_features)}") + if not isinstance(audio_features, (torch.Tensor, list)): + raise ValueError("Incorrect type of audio_features. " + f"Got type: {type(audio_features)}") - audio_feature_lens = kwargs.pop("audio_feature_lens") - if not isinstance(audio_feature_lens, (torch.Tensor, list)): - raise ValueError("Incorrect type of audio_feature_lens. " - f"Got type: {type(audio_feature_lens)}") + audio_feature_lens = kwargs.pop("audio_feature_lens") + if not isinstance(audio_feature_lens, (torch.Tensor, list)): + raise ValueError("Incorrect type of audio_feature_lens. 
" + f"Got type: {type(audio_feature_lens)}") - return MiniCPMOAudioFeatureInputs( - type="audio_features", - audio_features=flatten_bn(audio_features, concat=True), - audio_feature_lens=flatten_bn( - flatten_2d_lists(audio_feature_lens), concat=True), - audio_bounds=self._get_audio_bounds(input_ids, audio_start_id, - audio_end_id), - ) + audio_features_flat = flatten_bn(audio_features) + audio_feature_lens_flat = flatten_bn(audio_feature_lens) - raise AssertionError("This line should be unreachable.") - - def _parse_and_validate_inputs(self, input_ids: torch.Tensor, - **kwargs: object): - image_inputs = self._parse_and_validate_image_inputs( - input_ids, **kwargs) - if not any("audio" in key for key in kwargs): - return image_inputs, None - audio_inputs = self._parse_and_validate_audio_inputs( - input_ids, **kwargs) - return image_inputs, audio_inputs - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - **kwargs: Any, - ) -> torch.Tensor: - if intermediate_tensors is not None: - vlm_embeddings = None - else: - image_inputs, audio_inputs = \ - self._parse_and_validate_inputs(input_ids, **kwargs) - vlm_embeddings = self.get_embedding_with_vision( - input_ids, image_inputs) - - if audio_inputs is not None: - vlm_embeddings = self.get_embedding_with_audios( - vlm_embeddings, audio_inputs, - self.config.audio_chunk_length) - - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent - # for `torch.compile` integration - input_ids = None - - output = self.llm.model( - input_ids=input_ids, - positions=positions, - intermediate_tensors=intermediate_tensors, - inputs_embeds=vlm_embeddings, + return MiniCPMOAudioFeatureInputs( + type="audio_features", + audio_features=audio_features_flat, + audio_feature_lens=audio_feature_lens_flat, + embed_is_patch=audio_embed_is_patch, ) - return output + + def _parse_and_validate_multimodal_inputs(self, 
**kwargs: object) -> dict: + modalities = super()._parse_and_validate_multimodal_inputs(**kwargs) + + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. + for input_key in kwargs: + if input_key in ("audio_features", + "audio_embeds") and "audios" not in modalities: + modalities["audios"] = self._parse_and_validate_audio_input( + **kwargs) + + return modalities + + def _process_audio_input( + self, + audio_input: MiniCPMOAudioInputs, + ) -> Union[torch.Tensor, list[torch.Tensor]]: + if audio_input["type"] == "audio_embeds": + return audio_input["audio_embeds"] + + return self.get_audio_hidden_states(audio_input) + + def _process_multimodal_inputs(self, modalities: dict): + multimodal_embeddings = super()._process_multimodal_inputs(modalities) + + for modality in modalities: + if modality == "audios": + audio_input = modalities["audios"] + audio_features = self._process_audio_input(audio_input) + multimodal_embeddings += tuple( + scatter_patch_features( + audio_features, + audio_input["embed_is_patch"], + )) + + return multimodal_embeddings diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 23c010c63d558..76c7a59d656d5 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -23,17 +23,15 @@ # limitations under the License. 
"""Inference-only MiniCPM-V model compatible with HuggingFace weights.""" import math -import re from collections import defaultdict from collections.abc import Iterable, Mapping, Sequence from functools import cached_property, partial -from typing import (Any, Callable, Dict, List, Literal, Optional, Set, Tuple, - TypedDict, Union) +from typing import (Any, Callable, Literal, Optional, Set, Tuple, TypedDict, + Union) import numpy as np import torch import torch.types -from PIL import Image from torch import nn from transformers import BatchFeature, PretrainedConfig from typing_extensions import TypeVar @@ -50,9 +48,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputs, NestedTensors, - PlaceholderRange) +from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors from vllm.multimodal.parse import (DictEmbeddingItems, ImageItem, ImageProcessorItems, ImageSize, ModalityData, ModalityDataItems, @@ -67,13 +63,11 @@ from vllm.sequence import IntermediateTensors from vllm.utils import flatten_2d_lists from .idefics2_vision_model import Idefics2VisionTransformer -from .interfaces import (SupportsLoRA, SupportsMultiModal, SupportsPP, - SupportsV0Only) -from .utils import AutoWeightsLoader, flatten_bn, maybe_prefix - -CPU_DEVICE = torch.device("cpu") - -RawImageType = Union[Image.Image, torch.Tensor] +from .interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP) +from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, + merge_multimodal_embeddings) +from .vision import scatter_patch_features, select_patch_features class MiniCPMVImagePixelInputs(TypedDict): @@ -86,13 +80,6 @@ class 
MiniCPMVImagePixelInputs(TypedDict): instead of a batched tensor. """ - image_bounds: torch.Tensor - """ - Shape: `(batch_size * num_images * num_slices, 2)` - - This should be in `(start, stop)` format. - """ - tgt_sizes: torch.Tensor """ Shape: `(batch_size * num_images * num_slices, 2)` @@ -100,23 +87,34 @@ class MiniCPMVImagePixelInputs(TypedDict): This should be in `(height, width)` format. """ + embed_is_patch: Union[torch.Tensor, list[torch.Tensor]] + """ + A boolean mask indicating which image embeddings correspond + to patch tokens. + + Shape: `(batch_size * num_images, num_embeds)` + """ + + num_slices: torch.Tensor + """Shape: `(batch_size * num_images)`""" + class MiniCPMVImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - image_embeds: torch.Tensor + image_embeds: Union[torch.Tensor, list[torch.Tensor]] """ - Shape: `(batch_size * num_images * num_slices, - image_feature_size, hidden_size)` + Shape: `(batch_size * num_images, num_slices, hidden_size)` `hidden_size` must match the hidden size of language model backbone. instead of a batched tensor. """ - image_bounds: torch.Tensor + embed_is_patch: Union[torch.Tensor, list[torch.Tensor]] """ - Shape: `(batch_size * num_images * num_slices, 2)` + A boolean mask indicating which image embeddings correspond + to patch tokens. - This should be in `(start, stop)` format. 
+ Shape: `(batch_size * num_images, num_embeds)` """ @@ -233,15 +231,25 @@ def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]: def _minicpmv_field_config(hf_inputs: Mapping[str, torch.Tensor]): + pixel_values = hf_inputs.get("pixel_values", torch.empty(0)) + num_images = len(pixel_values) + + video_pixel_values = hf_inputs.get("video_pixel_values", torch.empty(0)) + num_videos = len(video_pixel_values) + return dict( pixel_values=MultiModalFieldConfig.batched("image"), image_sizes=MultiModalFieldConfig.batched("image"), tgt_sizes=MultiModalFieldConfig.batched("image"), image_embeds=MultiModalFieldConfig.batched("image"), + embed_is_patch=MultiModalFieldConfig.batched("image"), video_pixel_values=MultiModalFieldConfig.batched("video"), video_image_sizes=MultiModalFieldConfig.batched("video"), video_tgt_sizes=MultiModalFieldConfig.batched("video"), video_embeds=MultiModalFieldConfig.batched("video"), + video_embed_is_patch=MultiModalFieldConfig.batched("video"), + image_token_id=MultiModalFieldConfig.shared("image", num_images), + video_token_id=MultiModalFieldConfig.shared("video", num_videos), ) @@ -348,10 +356,11 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): return get_version_by_config(self.get_hf_config()) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + mm_limits = {"image": None} if self.get_model_version() == (2, 6): - return {"image": None, "video": None} - else: - return {"image": None} + mm_limits["video"] = None + + return mm_limits def get_mm_max_tokens_per_item( self, @@ -361,70 +370,79 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): mm_max_tokens = {"image": self.get_max_image_tokens()} if self.get_model_version() == (2, 6): mm_max_tokens["video"] = self.get_max_video_tokens(seq_len) + return mm_max_tokens + def get_slice_image_placeholder( + self, + image_size: ImageSize, + # For MiniCPM V/O 2.6 + image_idx: int = 0, + max_slice_nums: Optional[int] = None, + use_image_id: bool = True, + ) -> str: + 
image_processor = self.get_image_processor() + version = self.get_model_version() + + if version == (2, 0) or version == (2, 5): + return image_processor.get_slice_image_placeholder(image_size) + + return image_processor.get_slice_image_placeholder( + image_size, + image_idx=image_idx, + max_slice_nums=max_slice_nums, + use_image_id=use_image_id, + ) + + def get_num_image_tokens( + self, + image_size: ImageSize, + max_slice_nums: Optional[int] = None, + use_image_id: bool = True, + ) -> int: + tokenizer = self.get_tokenizer() + image_placeholders = self.get_slice_image_placeholder( + image_size, + max_slice_nums=max_slice_nums, + use_image_id=use_image_id, + ) + image_token_ids = tokenizer.encode(image_placeholders, + add_special_tokens=False) + + return len(image_token_ids) + + def get_max_image_tokens(self) -> int: + image_size = self.get_image_size_with_most_features() + return self.get_num_image_tokens(image_size) + + def get_image_max_slice_num(self) -> int: + return getattr(self.get_hf_config(), "max_slice_num", 9) + + def get_image_size_with_most_features(self) -> ImageSize: + image_size = getattr(self.get_hf_config(), "image_size", 448) + max_slice_num = self.get_image_max_slice_num() + return ImageSize(width=image_size, height=image_size * max_slice_num) + def get_max_video_frame_tokens(self) -> int: frame_size = self.get_video_frame_size_with_most_features() - return self.get_num_image_tokens(frame_size, - self.get_video_max_slice_num()) + + return self.get_num_image_tokens( + frame_size, + max_slice_nums=self.get_video_max_slice_num(), + use_image_id=False, + ) def get_max_video_tokens(self, seq_len: int) -> int: return self.get_max_video_frame_tokens( ) * self.get_num_frames_with_most_features(seq_len) - def get_slice_query_num(self) -> int: - hf_config = self.get_hf_config() - query_num = getattr(hf_config, "query_num", 64) - return query_num - - def get_max_slice_num(self) -> int: - hf_config = self.get_hf_config() - max_slice_num = getattr(hf_config, 
"max_slice_num", 9) - return max_slice_num - - def get_sliced_grid(self, image_size: ImageSize, - max_slice_num: int) -> Tuple[int, int]: - if self.get_model_version() == (2, 6): - slice_grid = self.get_image_processor().get_sliced_grid( - image_size, max_slice_num) - else: - slice_grid = self.get_image_processor().get_sliced_grid(image_size) - return slice_grid - - def get_num_image_tokens(self, image_size: ImageSize, - max_slice_num: int) -> int: - slice_grid = self.get_sliced_grid(image_size, max_slice_num) - num_tokens = self.get_slice_query_num( - ) + 2 # ( * query_num) - if slice_grid is not None: - if self.get_model_version() == (2, 6): - num_additional_tokens = 0 - else: - # ( * query_num) - num_additional_tokens = 2 - num_tokens += ((self.get_slice_query_num() + 2) \ - * slice_grid[0] * slice_grid[1]) \ - + slice_grid[1] - 1 + num_additional_tokens - return num_tokens - - def get_image_slice_nums(self, image_size: torch.Tensor, - max_slice_nums: int) -> int: - grid = self.get_sliced_grid(image_size, max_slice_nums) - return 1 if grid is None else grid[0] * grid[1] + 1 - - def get_max_image_tokens(self) -> int: - image_size = self.get_image_size_with_most_features() - return self.get_num_image_tokens(image_size, self.get_max_slice_num()) - - def get_image_size_with_most_features(self) -> ImageSize: - # Result in the max possible feature size (h:w = 9:1) - return self.get_default_image_sizes(self.get_max_slice_num()) - def get_video_max_slice_num(self) -> int: return 1 def get_video_frame_size_with_most_features(self) -> ImageSize: - return self.get_default_image_sizes(self.get_video_max_slice_num()) + image_size = getattr(self.get_hf_config(), "image_size", 448) + max_slice_num = self.get_video_max_slice_num() + return ImageSize(width=image_size, height=image_size * max_slice_num) def get_max_video_frames(self, max_tokens: int) -> int: num_frame_tokens = self.get_max_video_frame_tokens() @@ -436,10 +454,7 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): 
max_images = mm_config.get_limit_per_prompt("image") max_videos = mm_config.get_limit_per_prompt("video") - # count tokens - # which are not in get_max_image_tokens - max_image_tokens = self.get_max_image_tokens( - ) * max_images + 4 * max_images + max_image_tokens = self.get_max_image_tokens() * max_images max_total_frames = self.get_max_video_frames(seq_len - max_image_tokens) @@ -447,10 +462,6 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): return num_frames - def get_default_image_sizes(self, num_slices: int) -> ImageSize: - image_size = getattr(self.get_hf_config(), "image_size", 448) - return ImageSize(width=image_size, height=image_size * num_slices) - _I = TypeVar("_I", bound=MiniCPMVProcessingInfo, @@ -499,42 +510,30 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): def _get_data_parser(self) -> MultiModalDataParser: return MiniCPMVMultiModalDataParser() - def get_slice_image_placeholder(self, image_size: ImageSize, - **kwargs) -> str: - image_processor = self.info.get_image_processor() - version = self.info.get_model_version() - if version == (2, 0) or version == (2, 5): - return image_processor.get_slice_image_placeholder(image_size) - return image_processor.get_slice_image_placeholder( - image_size, **kwargs) - def get_image_prompt_texts(self, image_size: ImageSize, image_idx: int = 0) -> str: - return self.get_slice_image_placeholder(image_size, - image_idx=image_idx) + return self.info.get_slice_image_placeholder( + image_size, + image_idx=image_idx, + ) def get_video_prompt_texts(self, image_size: ImageSize, num_frames: int) -> str: - return self.get_slice_image_placeholder( + return self.info.get_slice_image_placeholder( image_size=image_size, image_idx=0, max_slice_nums=self.info.get_video_max_slice_num(), use_image_id=False, ) * num_frames - def get_special_tokens(self) -> Dict[str, torch.Tensor]: + def get_embed_is_patch( + self, + input_ids: list[int], + ) -> torch.Tensor: tokenizer = self.info.get_tokenizer() - - 
special_tokens = { - "im_start_id": tokenizer.im_start_id, - "im_end_id": tokenizer.im_end_id, - } - if hasattr(tokenizer, "slice_start_id"): - special_tokens["slice_start_id"] = tokenizer.slice_start_id - special_tokens["slice_end_id"] = tokenizer.slice_end_id - - return {k: torch.tensor(v) for k, v in special_tokens.items()} + unk_token_id = tokenizer.get_vocab()[""] + return torch.tensor(input_ids) == unk_token_id def process_images( self, @@ -546,14 +545,43 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): parsed_images = (self._get_data_parser().parse_mm_data({ "image": images - }).get_items("image", ImageProcessorItems)) + }).get_items("image", + (MiniCPMVImageEmbeddingItems, ImageProcessorItems))) - return self._base_call_hf_processor( - prompts=[self.info.image_pattern] * len(parsed_images), - mm_data={"images": [[image] for image in parsed_images]}, - mm_kwargs=mm_kwargs, - out_keys={"pixel_values", "image_sizes", "tgt_sizes"}, - ) + if isinstance(parsed_images, MiniCPMVImageEmbeddingItems): + image_inputs = {} + else: + image_inputs = self._base_call_hf_processor( + prompts=[self.info.image_pattern] * len(parsed_images), + mm_data={"images": [[image] for image in parsed_images]}, + mm_kwargs=mm_kwargs, + out_keys={"pixel_values", "image_sizes", "tgt_sizes"}, + ) + + image_sizes = [ + parsed_images.get_image_size(i) for i in range(len(parsed_images)) + ] + image_repl_features = [ + self.get_image_prompt_texts(size, idx) + for idx, size in enumerate(image_sizes) + ] + + tokenizer = self.info.get_tokenizer() + image_repls_feature_tokens = [ + tokenizer.encode(image_repl, add_special_tokens=False) + for image_repl in image_repl_features + ] + + embed_is_patch = [ + self.get_embed_is_patch(image_repl_tokens) + for image_repl_tokens in image_repls_feature_tokens + ] + image_inputs["embed_is_patch"] = embed_is_patch + + unk_token_id = tokenizer.get_vocab()[""] + image_inputs["image_token_id"] = torch.tensor(unk_token_id) + + return image_inputs 
def process_videos( self, @@ -565,25 +593,55 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): parsed_videos = (self._get_data_parser().parse_mm_data({ "video": videos - }).get_items("video", VideoProcessorItems)) + }).get_items("video", + (MiniCPMVVideoEmbeddingItems, VideoProcessorItems))) - max_slice_num = self.info.get_video_max_slice_num() + if isinstance(parsed_videos, MiniCPMVVideoEmbeddingItems): + video_inputs = {} + else: + video_inputs = self._base_call_hf_processor( + prompts=[ + self.info.image_pattern * len(video) + for video in parsed_videos + ], + mm_data={"images": list(parsed_videos)}, + mm_kwargs={ + **mm_kwargs, + "max_slice_nums": + self.info.get_video_max_slice_num(), + }, + out_keys={"pixel_values", "image_sizes", "tgt_sizes"}, + ) - video_inputs = self._base_call_hf_processor( - prompts=[ - self.info.image_pattern * len(video) for video in parsed_videos - ], - mm_data={"images": list(parsed_videos)}, - mm_kwargs={ - **mm_kwargs, "max_slice_nums": max_slice_num - }, - out_keys={"pixel_values", "image_sizes", "tgt_sizes"}, - ) + frame_sizes = [ + parsed_videos.get_frame_size(i) for i in range(len(parsed_videos)) + ] + num_frames = [ + parsed_videos.get_num_frames(i) for i in range(len(parsed_videos)) + ] + video_repl_features = [ + self.get_video_prompt_texts(size, nframes) + for size, nframes in zip(frame_sizes, num_frames) + ] - return {f"video_{k}": v for k, v in video_inputs.items()} + tokenizer = self.info.get_tokenizer() + video_repls_feature_tokens = [ + tokenizer.encode(video_repl, add_special_tokens=False) + for video_repl in video_repl_features + ] - def get_placeholder_match_pattern(self) -> str: - return r"\(<(image|video)>./\)" + embed_is_patch = [ + self.get_embed_is_patch(video_repl_tokens) + for video_repl_tokens in video_repls_feature_tokens + ] + video_inputs["embed_is_patch"] = embed_is_patch + + video_inputs = {f"video_{k}": v for k, v in video_inputs.items()} + + unk_token_id = tokenizer.get_vocab()[""] + 
video_inputs["video_token_id"] = torch.tensor(unk_token_id) + + return video_inputs def process_mm_inputs( self, @@ -602,7 +660,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): mm_kwargs: Mapping[str, object], *, out_keys: set[str], - ) -> Mapping[str, NestedTensors]: + ) -> dict[str, NestedTensors]: # This processor supports zipping prompt and mm_data together if self.info.get_model_version() == (2, 6): inputs = super()._call_hf_processor( @@ -635,14 +693,13 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], ) -> BatchFeature: - # Do not support combination inputs of images and videos for now - # Try to handle interleaved multimodal data tokenizer = self.info.get_tokenizer() + + input_ids = torch.tensor([tokenizer.encode(prompt)]) mm_inputs = self.process_mm_inputs(mm_data, mm_kwargs) return BatchFeature({ - "input_ids": - torch.tensor([tokenizer.encode(prompt)]), + "input_ids": input_ids, **mm_inputs, }) @@ -701,39 +758,8 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): ) -> Mapping[str, MultiModalFieldConfig]: return _minicpmv_field_config(hf_inputs) - def apply( - self, - prompt: Union[str, List[int]], - mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - return_mm_hashes: bool = False, - ) -> MultiModalInputs: - if isinstance(prompt, list): - prompt = self.info.get_tokenizer().decode(prompt) - matches = re.findall(self.get_placeholder_match_pattern(), prompt) - mm_orders = { - f"{modality}_orders": - torch.tensor( - [index for index, m in enumerate(matches) if m == modality]) - for modality in self.info.get_supported_mm_limits() - } - result = super().apply(prompt, mm_data, hf_processor_mm_kwargs, - return_mm_hashes) - # Exclude x from placeholders - if "image" in result["mm_placeholders"] and \ - self.info.get_model_version() == (2, 6): - result["mm_placeholders"]["image"] = [ - PlaceholderRange(offset=p["offset"] + 
3 + idx // 10, - length=p["length"] - 3 - idx // 10) - for idx, p in enumerate(result["mm_placeholders"]["image"]) - ] - result["mm_kwargs"].update(**mm_orders) - result["mm_kwargs"].update(**self.get_special_tokens()) - return result - -class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP, - SupportsV0Only): +class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): """ The abstract class of MiniCPMV can only be inherited, but cannot be instantiated. @@ -767,6 +793,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP, prefix=maybe_prefix( prefix, "resampler")) + self.mm_token_ids = set[int]() self.make_empty_intermediate_tensors = ( self.llm.make_empty_intermediate_tensors) @@ -777,233 +804,191 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP, return get_sampler() - def get_embedding_with_vision( + def _parse_and_validate_vision_input( self, - input_ids: torch.Tensor, - image_inputs: Optional[MiniCPMVImageInputs], - ) -> torch.Tensor: - vlm_embedding: torch.Tensor = self.llm.get_input_embeddings(input_ids) - - if image_inputs is None: - return vlm_embedding - - if image_inputs["type"] == "image_embeds": - vision_hidden_states = image_inputs["image_embeds"].to( - device=vlm_embedding.device, - dtype=vlm_embedding.dtype, - ) - else: - vision_hidden_states = self.get_vision_hidden_states(image_inputs) - - # See NOTE in _parse_and_validate_inputs - image_bounds = image_inputs["image_bounds"] - if len(image_bounds) > 0: - image_indices = torch.stack([ - torch.arange(start, end, dtype=torch.long) - for start, end in image_bounds.tolist() - ]).to(vlm_embedding.device) - - vlm_embedding.scatter_( - 0, - image_indices.view(-1, 1).repeat(1, vlm_embedding.shape[-1]), - vision_hidden_states.view(-1, vision_hidden_states.shape[-1]), - ) - - return vlm_embedding - - def _get_image_bounds( - self, - input_ids: torch.Tensor, - im_start_id: torch.Tensor, - im_end_id: torch.Tensor, - slice_start_id: Optional[torch.Tensor] 
= None, - slice_end_id: Optional[torch.Tensor] = None) -> torch.Tensor: - # All the images in the batch should share the same special image - # bound token ids. - start_cond = input_ids == im_start_id[0] - end_cond = input_ids == im_end_id[0] - if slice_start_id is not None: - start_cond |= (input_ids == slice_start_id[0]) - end_cond |= (input_ids == slice_end_id[0]) - - image_start_tokens, = torch.where(start_cond) - image_start_tokens += 1 - image_end_tokens, = torch.where(end_cond) - valid_image_nums = max(len(image_start_tokens), len(image_end_tokens)) - - if valid_image_nums == 0: - return torch.zeros((0, 2), device=input_ids.device) - - return torch.hstack([ - image_start_tokens[:valid_image_nums].unsqueeze(-1), - image_end_tokens[:valid_image_nums].unsqueeze(-1), - ]) - - def _parse_and_validate_image_inputs( - self, - input_ids: torch.Tensor, + modality: str, **kwargs: object, ) -> Optional[MiniCPMVImageInputs]: - image_keys = {"pixel_values", "tgt_sizes"} - pixel_data = { - "image": { - key: kwargs.pop(key, None) - for key in image_keys - }, - "video": { - key: kwargs.pop("video_" + key, None) - for key in image_keys - } - } - embed_data = { - "image": kwargs.pop("image_embeds", None), - "video": kwargs.pop("video_embeds", None), - } + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) - all_pixel_data = [ - v for vs in pixel_data.values() for v in vs.values() - if v is not None - ] - all_embed_data = [v for v in embed_data.values() if v is not None] - if len(all_pixel_data) == 0 and len(all_embed_data) == 0: + if pixel_values is None and image_embeds is None: return None - im_start_id = kwargs.pop("im_start_id") - if not isinstance(im_start_id, torch.Tensor): - raise ValueError("Incorrect type of im_start_id. 
" - f"Got type: {type(im_start_id)}") + image_token_id = kwargs.pop("image_token_id") + if image_token_id is not None: + assert isinstance(image_token_id, torch.Tensor) + self.mm_token_ids.add(image_token_id.flatten().unique().item()) - im_end_id = kwargs.pop("im_end_id") - if not isinstance(im_end_id, torch.Tensor): - raise ValueError("Incorrect type of im_end_id. " - f"Got type: {type(im_end_id)}") + embed_is_patch = kwargs.pop("embed_is_patch") + if not isinstance(embed_is_patch, (torch.Tensor, list)): + raise ValueError( + f"Incorrect type of embed_is_patch for {modality=}. " + f"Got type: {type(embed_is_patch)}") - slice_start_id = kwargs.pop("slice_start_id", None) - if slice_start_id is not None and not isinstance( - slice_start_id, torch.Tensor): - raise ValueError("Incorrect type of slice_start_id. " - f"Got type: {type(slice_start_id)}") + embed_is_patch = flatten_bn(embed_is_patch) - slice_end_id = kwargs.pop("slice_end_id", None) - if slice_end_id is not None and not isinstance(slice_end_id, - torch.Tensor): - raise ValueError("Incorrect type of slice_end_id. " - f"Got type: {type(slice_end_id)}") + if image_embeds is not None: + if not isinstance(image_embeds, (torch.Tensor, list)): + raise ValueError( + f"Incorrect type of image_embeds for {modality=}. " + f"Got type: {type(image_embeds)}") - if len(all_embed_data) > 0: - if len(all_embed_data) > 1: - raise ValueError("Incorrect inputs for vision embeddings. " - "Image embeds and video embeds can not " - "exist simultaneously.") - - vision_embeds, = all_embed_data - if not isinstance(vision_embeds, (torch.Tensor, list)): - raise ValueError(f"Incorrect type of vision_embeds. 
" - f"Got type: {type(vision_embeds)}") + image_embeds_flat = flatten_bn(image_embeds) return MiniCPMVImageEmbeddingInputs( type="image_embeds", - image_embeds=flatten_bn(flatten_2d_lists(vision_embeds), - concat=True), - image_bounds=self._get_image_bounds(input_ids, im_start_id, - im_end_id, slice_start_id, - slice_end_id), + image_embeds=image_embeds_flat, + embed_is_patch=embed_is_patch, ) - order_data = dict[str, Union[torch.Tensor, list[torch.Tensor]]]() - for modality in ("image", "video"): - modality_orders = kwargs.pop(f"{modality}_orders", None) - if modality_orders is not None: - if not isinstance(modality_orders, (torch.Tensor, list)): - raise ValueError(f"Incorrect type of {modality}_orders. " - f"Got type: {type(modality_orders)}") + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError( + f"Incorrect type of pixel_values for {modality=}. " + f"Got type: {type(pixel_values)}") - order_data[modality] = modality_orders + tgt_sizes = kwargs.pop("tgt_sizes") + if not isinstance(tgt_sizes, (torch.Tensor, list)): + raise ValueError(f"Incorrect type of tgt_sizes for {modality=}. 
" + f"Got type: {type(tgt_sizes)}") - batch_sizes = { - modality: len(modality_orders) - for modality, modality_orders in order_data.items() - } - unique_batch_sizes = set(batch_sizes.values()) - assert len(unique_batch_sizes) == 1, ( - f"Found inconsistent batch sizes: {batch_sizes}") - batch_size, = unique_batch_sizes + num_slices = [[len(p) for p in ps] for ps in pixel_values] + num_slices_flat = flatten_bn(torch.tensor(num_slices)) - pixel_values_flat = list[torch.Tensor]() - tgt_sizes_flat = list[torch.Tensor]() - for b in range(batch_size): - mm_orders_b = [(idx_b.item(), modality) - for modality, modality_orders in order_data.items() - for idx_b in modality_orders[b]] + pixel_values_flat = flatten_bn(flatten_2d_lists(pixel_values)) + tgt_sizes_flat = flatten_bn(flatten_2d_lists(tgt_sizes), concat=True) - for _, modality in sorted(mm_orders_b, key=lambda x: x[0]): - modality_pixel_data = pixel_data[modality] - - modality_pixel_values = modality_pixel_data["pixel_values"] - if not isinstance(modality_pixel_values, (torch.Tensor, list)): - raise ValueError( - f"Incorrect type of pixel_values for {modality=}. " - f"Got type: {type(modality_pixel_values)}") - - modality_tgt_sizes = modality_pixel_data["tgt_sizes"] - if not isinstance(modality_tgt_sizes, (torch.Tensor, list)): - raise ValueError( - f"Incorrect type of tgt_sizes for {modality=}. " - f"Got type: {type(modality_tgt_sizes)}") - - pixel_values_flat += flatten_2d_lists(modality_pixel_values[b]) - tgt_sizes_flat += flatten_2d_lists(modality_tgt_sizes[b]) - - # NOTE: Input IDs does not contain image tokens during memory profiling, - # so we allow it to be empty if len(pixel_values_flat) != len(tgt_sizes_flat): raise ValueError("Inconsistent flattened lengths, found: " f"{len(pixel_values_flat)} vs. 
" f"{len(tgt_sizes_flat)}") - if len(pixel_values_flat) == 0: - return None - return MiniCPMVImagePixelInputs( type="pixel_values", pixel_values=pixel_values_flat, - tgt_sizes=torch.stack(tgt_sizes_flat), - image_bounds=self._get_image_bounds(input_ids, im_start_id, - im_end_id, slice_start_id, - slice_end_id), + tgt_sizes=tgt_sizes_flat, + embed_is_patch=embed_is_patch, + num_slices=num_slices_flat, ) - def _parse_and_validate_inputs(self, input_ids: torch.Tensor, - **kwargs: object): - return self._parse_and_validate_image_inputs(input_ids, **kwargs) + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + modalities = {} + + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. + for input_key in kwargs: + if input_key in ("pixel_values", + "image_embeds") and "images" not in modalities: + modalities["images"] = self._parse_and_validate_vision_input( + "images", **kwargs) + if input_key in ("video_pixel_values", + "video_embeds") and "videos" not in modalities: + + def _image_key(video_key: str): + if video_key == "video_token_id": + return "image_token_id" + + return video_key.removeprefix("video_") + + modalities["videos"] = self._parse_and_validate_vision_input( + "videos", **{ + _image_key(k): v + for k, v in kwargs.items() + }) + + return modalities + + def _process_vision_input( + self, + image_input: MiniCPMVImageInputs, + ) -> Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor, ...]]: + if image_input["type"] == "image_embeds": + return image_input["image_embeds"] + + image_features_flat = self.get_vision_hidden_states(image_input) + + # Reconstruct the batch dimension + return image_features_flat.split(image_input["num_slices"].tolist()) + + def _process_multimodal_inputs(self, modalities: dict): + # The result multimodal_embeddings is tuple of tensors, with each + # tensor correspoending to a multimodal data item (image or video). + multimodal_embeddings: tuple[torch.Tensor, ...] 
= () + + # NOTE: It is important to iterate over the keys in this dictionary + # to preserve the order of the modalities. + for modality in modalities: + if modality == "images": + image_input = modalities["images"] + image_features = self._process_vision_input(image_input) + multimodal_embeddings += tuple( + scatter_patch_features( + image_features, + image_input["embed_is_patch"], + )) + if modality == "videos": + video_input = modalities["videos"] + video_features = self._process_vision_input(video_input) + multimodal_embeddings += tuple( + scatter_patch_features( + video_features, + video_input["embed_is_patch"], + )) + + return multimodal_embeddings + + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + modalities = self._parse_and_validate_multimodal_inputs(**kwargs) + if not modalities: + return None + + return self._process_multimodal_inputs(modalities) + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + inputs_embeds = self.llm.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + assert len(self.mm_token_ids) > 0 + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + select_patch_features(multimodal_embeddings), + list(self.mm_token_ids), + ) + return inputs_embeds def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: Any, ) -> torch.Tensor: if intermediate_tensors is not None: - vlm_embeddings = None - else: - image_inputs = \ - self._parse_and_validate_inputs(input_ids, **kwargs) - vlm_embeddings = self.get_embedding_with_vision( - input_ids, image_inputs) + inputs_embeds = None - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent - # for `torch.compile` integration - input_ids = None + # NOTE: 
In v1, inputs_embeds is always generated at model runner from + # `get_multimodal_embeddings` and `get_input_embeddings`, this + # condition is only for v0 compatibility. + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) - output = self.llm.model( + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + hidden_states = self.llm.model( input_ids=input_ids, positions=positions, intermediate_tensors=intermediate_tensors, - inputs_embeds=vlm_embeddings, + inputs_embeds=inputs_embeds, ) - return output + return hidden_states def compute_logits( self, @@ -1105,9 +1090,6 @@ class MiniCPMV2_0(MiniCPMVBaseModel): return model - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.embed_tokens(input_ids) - def init_resampler(self, embed_dim: int, vision_dim: int, diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 9224687d8a5d3..b2f795155f17b 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -92,8 +92,8 @@ class MolmoImageInputs(TypedDict): Shape: `(batch_size * num_images, num_embeds)` """ - num_crops: Union[torch.Tensor, list[torch.Tensor]] - """Shape: `(batch_size, num_images)`""" + num_crops: torch.Tensor + """Shape: `(batch_size * num_images)`""" @dataclass @@ -1492,6 +1492,7 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, self.img_patch_id = img_patch_id.flatten().unique().item() embed_is_patch = flatten_bn(embed_is_patch) + num_crops = flatten_bn(num_crops, concat=True) return MolmoImageInputs( images=images, @@ -1510,31 +1511,24 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, feat_is_patch = image_input["feat_is_patch"] num_crops = image_input["num_crops"] - if isinstance(images, list): - # Call the vision backbone on the whole batch at once - images_flat = flatten_bn(images, concat=True) - 
image_masks_flat = (None if image_masks is None else flatten_bn( - image_masks, concat=True)) + # Call the vision backbone on the whole batch at once + images_flat = flatten_bn(images, concat=True) + image_masks_flat = (None if image_masks is None else flatten_bn( + image_masks, concat=True)) + feat_is_patch_flat = flatten_bn(feat_is_patch, concat=True) - image_features_flat = self.vision_backbone( - images=images_flat.unsqueeze(0), - image_masks=(None if image_masks_flat is None else - image_masks_flat.unsqueeze(0)), - ).squeeze(0) - - # Reconstruct the batch dimension - num_crops_per_image = [nc.sum().item() for nc in num_crops] - image_features = image_features_flat.split(num_crops_per_image) - else: - image_features = self.vision_backbone( - images=images, - image_masks=image_masks, - ) + image_features_flat = self.vision_backbone( + images=images_flat.unsqueeze(0), + image_masks=(None if image_masks_flat is None else + image_masks_flat.unsqueeze(0)), + ).squeeze(0) # Only the features corresponding to patch tokens are relevant return [ - feats[f_is_patch] - for feats, f_is_patch in zip(image_features, feat_is_patch) + feats[f_is_patch] for feats, f_is_patch in zip( + image_features_flat.split(num_crops.tolist()), + feat_is_patch_flat.split(num_crops.tolist()), + ) ] def get_multimodal_embeddings( From 8958217ad5a6830c4d911e5f15e6eb791df337b6 Mon Sep 17 00:00:00 2001 From: Hiroaki Sugiyama Date: Thu, 27 Mar 2025 23:29:29 +0900 Subject: [PATCH 045/593] [Bugfix] Fix use_cascade_attention handling for Alibi-based models on vllm/v1 (#15211) Signed-off-by: h-sugi Co-authored-by: Woosuk Kwon --- vllm/utils.py | 14 +++++++++++++- vllm/v1/worker/gpu_model_runner.py | 7 +++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 516b33dca1dc8..77f4e2dcf5e45 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -61,7 +61,7 @@ import vllm.envs as envs from vllm.logger import enable_trace_function_call, init_logger if 
TYPE_CHECKING: - from vllm.config import VllmConfig + from vllm.config import ModelConfig, VllmConfig logger = init_logger(__name__) @@ -2498,6 +2498,18 @@ def cprofile(save_file: Optional[str] = None, enabled: bool = True): return decorator +# Only relevant for models using ALiBi (e.g, MPT) +def check_use_alibi(model_config: ModelConfig) -> bool: + return (getattr(model_config.hf_text_config, "alibi", False) # Falcon + or ("BloomForCausalLM" in getattr(model_config.hf_config, + "architectures", [])) # Bloom + or getattr(model_config.hf_text_config, "position_encoding_type", + "") == "alibi" # codellm_1b_alibi + or + (hasattr(model_config.hf_text_config, "attn_config") # MPT + and model_config.hf_text_config.attn_config.get("alibi", False))) + + def sha256(input) -> int: """Hash any picklable Python object using SHA-256. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index bcf7762b44496..230479f3f15e7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -25,7 +25,7 @@ from vllm.multimodal.utils import group_mm_inputs_by_modality from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - LayerBlockType, LazyLoader, cdiv, + LayerBlockType, LazyLoader, cdiv, check_use_alibi, is_pin_memory_available) from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.core.encoder_cache_manager import compute_encoder_budget @@ -223,6 +223,9 @@ class GPUModelRunner(LoRAModelRunnerMixin): device="cpu", pin_memory=self.pin_memory) + # Only relevant for models using ALiBi (e.g, MPT) + self.use_alibi = check_use_alibi(model_config) + self.inputs_embeds = torch.zeros( (self.max_num_tokens, self.hidden_size), dtype=self.dtype, @@ -689,7 +692,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): query_lens=num_scheduled_tokens, num_query_heads=self.num_query_heads, 
num_kv_heads=self.num_kv_heads, - use_alibi=False, # FIXME + use_alibi=self.use_alibi, use_sliding_window=self.window_size is not None, num_sms=self.num_sms, ) From 07bf813fb554c9a78d1e9f4a587edd8b6d9d7ccd Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 28 Mar 2025 00:30:53 +0800 Subject: [PATCH 046/593] [Doc] Link to onboarding tasks (#15629) Signed-off-by: DarkLight1337 --- docs/source/conf.py | 5 +++++ docs/source/contributing/overview.md | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index b02b84826c9f2..3e790827f53bb 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -103,6 +103,11 @@ myst_url_schemes = { "title": "Pull Request #{{path}}", "classes": ["github"], }, + "gh-project": { + "url": "https://github.com/vllm-project/projects/{{path}}", + "title": "Project #{{path}}", + "classes": ["github"], + }, "gh-dir": { "url": "https://github.com/vllm-project/vllm/tree/main/{{path}}", "title": "{{path}}", diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index a414118316692..10cbc0eb1264b 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -11,6 +11,15 @@ We also believe in the power of community support; thus, answering queries, offe Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository! +## Job Board + +Unsure on where to start? 
Check out the following links for tasks to work on: + +- [Good first issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22) + - [Selected onboarding tasks](gh-project:6) +- [New model requests](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22new%20model%22) + - [Models with multi-modal capabilities](gh-project:10) + ## License See . From 247181536fc2cab728077f3e7489622e19671d2d Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 28 Mar 2025 01:36:32 +0800 Subject: [PATCH 047/593] [Misc] Replace `is_encoder_decoder_inputs` with `split_enc_dec_inputs` (#15620) Signed-off-by: DarkLight1337 --- .../multimodal/processing/test_idefics3.py | 2 +- .../multimodal/processing/test_phi3v.py | 2 +- vllm/engine/arg_utils.py | 2 +- vllm/engine/llm_engine.py | 28 ++++++++---------- vllm/inputs/parse.py | 22 +++++++++----- vllm/inputs/registry.py | 14 ++++----- vllm/model_executor/models/idefics3.py | 4 +-- vllm/v1/engine/processor.py | 29 ++++++++----------- 8 files changed, 49 insertions(+), 54 deletions(-) diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py index fdbe2f17692f7..4cff429a53941 100644 --- a/tests/models/multimodal/processing/test_idefics3.py +++ b/tests/models/multimodal/processing/test_idefics3.py @@ -29,7 +29,7 @@ def test_processor_override( num_imgs: int, kwargs_on_init: bool, ): - """Ensure input_processor_for_idefics3 handles num_crops properly.""" + """Ensure Idefics3MultiModalProcessor handles num_crops properly.""" # Same as the previous test - don't initialize mm_processor_kwargs # in this test and assume that the kwargs will be correctly expanded by # the partial when calling the custom input processor. 
diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py index 2f0c8e7e5492c..dd5f30a23176b 100644 --- a/tests/models/multimodal/processing/test_phi3v.py +++ b/tests/models/multimodal/processing/test_phi3v.py @@ -30,7 +30,7 @@ def test_processor_override( num_imgs: int, kwargs_on_init: bool, ): - """Ensure input_processor_for_phi3v handles num_crops properly.""" + """Ensure Phi3VMultiModalProcessor handles num_crops properly.""" # Avoid initializing CUDA early from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 784ea35beb357..53af3e5717c52 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -665,7 +665,7 @@ class EngineArgs: type=nullable_kvs, default=EngineArgs.limit_mm_per_prompt, # The default value is given in - # MultiModalRegistry.init_mm_limits_per_prompt + # MultiModalConfig.get_limit_per_prompt help=('For each multimodal plugin, limit how many ' 'input instances to allow for each prompt. 
' 'Expects a comma-separated list of items, ' diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3d019ea58c5e1..4856c3568319b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -30,8 +30,8 @@ from vllm.entrypoints.openai.logits_processors import ( get_logits_processors as get_openai_logits_processors) from vllm.executor.executor_base import ExecutorBase from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, - PromptType, SingletonInputsAdapter) -from vllm.inputs.parse import is_encoder_decoder_inputs, is_token_prompt + PromptType) +from vllm.inputs.parse import is_token_prompt, split_enc_dec_inputs from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.logits_process import get_bad_words_logits_processors @@ -609,12 +609,7 @@ class LLMEngine: seq_id = next(self.seq_counter) eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) - if is_encoder_decoder_inputs(processed_inputs): - decoder_inputs = processed_inputs["decoder"] - encoder_inputs = processed_inputs["encoder"] - else: - decoder_inputs = processed_inputs - encoder_inputs = None + encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id, lora_request, prompt_adapter_request) @@ -2031,15 +2026,16 @@ class LLMEngine: def _validate_model_inputs(self, inputs: ProcessorInputs, lora_request: Optional[LoRARequest]): - if is_encoder_decoder_inputs(inputs): - # For encoder-decoder multimodal models, the max_prompt_len - # restricts the decoder prompt length - prompt_inputs = inputs["decoder" if self.model_config. 
- is_multimodal_model else "encoder"] - else: - prompt_inputs = inputs + encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs) - prompt_ids = SingletonInputsAdapter(prompt_inputs).prompt_token_ids + # For encoder-decoder multimodal models, the max_prompt_len + # restricts the decoder prompt length + if self.model_config.is_multimodal_model: + prompt_inputs = decoder_inputs + else: + prompt_inputs = encoder_inputs or decoder_inputs + + prompt_ids = prompt_inputs["prompt_token_ids"] if prompt_ids is None or len(prompt_ids) == 0: raise ValueError("Prompt cannot be empty") diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index ed1056948d807..28e207de1fd39 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -1,15 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 - from collections.abc import Sequence -from typing import Literal, TypedDict, Union, cast, overload +from typing import Literal, Optional, TypedDict, Union, cast, overload from typing_extensions import TypeIs from vllm.utils import is_list_of -from .data import (EncoderDecoderInputs, ExplicitEncoderDecoderPrompt, - ProcessorInputs, PromptType, SingletonPrompt, TextPrompt, - TokensPrompt) +from .data import (ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType, + SingletonInputs, SingletonPrompt, TextPrompt, TokensPrompt) class ParsedText(TypedDict): @@ -110,6 +108,14 @@ def is_explicit_encoder_decoder_prompt( return isinstance(prompt, dict) and "encoder_prompt" in prompt -def is_encoder_decoder_inputs( - inputs: ProcessorInputs) -> TypeIs[EncoderDecoderInputs]: - return "encoder" in inputs and "decoder" in inputs +def split_enc_dec_inputs( + inputs: ProcessorInputs, +) -> tuple[Optional[SingletonInputs], SingletonInputs]: + if "encoder" in inputs and "decoder" in inputs: + # NOTE: This passes pyright but not mypy + return ( + inputs["encoder"], # type: ignore[typeddict-item] + inputs["decoder"], # type: ignore[typeddict-item] + ) + + return None, inputs diff --git 
a/vllm/inputs/registry.py b/vllm/inputs/registry.py index b6ceb5fb82d70..8b95db7a72522 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -19,7 +19,7 @@ from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides, resolve_mm_processor_kwargs) from .data import ProcessorInputs, SingletonInputs -from .parse import is_encoder_decoder_inputs +from .parse import split_enc_dec_inputs if TYPE_CHECKING: from vllm.config import ModelConfig @@ -462,13 +462,11 @@ class InputRegistry: **mm_processor_kwargs, ) - if is_encoder_decoder_inputs(processed_inputs): - self._ensure_mm_kwargs(processed_inputs["encoder"], - mm_processor_kwargs) - self._ensure_mm_kwargs(processed_inputs["decoder"], - mm_processor_kwargs) - else: - self._ensure_mm_kwargs(processed_inputs, mm_processor_kwargs) + encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) + if encoder_inputs is not None: + self._ensure_mm_kwargs(encoder_inputs, mm_processor_kwargs) + if decoder_inputs is not None: + self._ensure_mm_kwargs(decoder_inputs, mm_processor_kwargs) return processed_inputs diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 234e4498f163b..432f26141048b 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -232,7 +232,7 @@ class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo] ) -class Idefics3MultimodalProcessor( +class Idefics3MultiModalProcessor( BaseMultiModalProcessor[Idefics3ProcessingInfo]): def _call_hf_processor( @@ -575,7 +575,7 @@ class Idefics3Model(nn.Module): @MULTIMODAL_REGISTRY.register_processor( - Idefics3MultimodalProcessor, + Idefics3MultiModalProcessor, info=Idefics3ProcessingInfo, dummy_inputs=Idefics3DummyInputsBuilder) class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index e281781675769..065ac0920af77 100644 --- 
a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -7,7 +7,7 @@ from typing import Optional, Union from vllm.config import VllmConfig from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, PromptType, SingletonInputsAdapter) -from vllm.inputs.parse import is_encoder_decoder_inputs +from vllm.inputs.parse import split_enc_dec_inputs from vllm.inputs.preprocess import InputPreprocessor from vllm.lora.request import LoRARequest from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, @@ -209,14 +209,8 @@ class Processor: self._validate_model_inputs(processed_inputs, lora_request) - if is_encoder_decoder_inputs(processed_inputs): - decoder_inputs = SingletonInputsAdapter( - processed_inputs["decoder"]) - encoder_inputs = SingletonInputsAdapter( - processed_inputs["encoder"]) - else: - decoder_inputs = SingletonInputsAdapter(processed_inputs) - encoder_inputs = None + encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) + decoder_inputs = SingletonInputsAdapter(decoder_inputs) # TODO: Impl encoder-decoder if encoder_inputs is not None: @@ -301,15 +295,16 @@ class Processor: def _validate_model_inputs(self, inputs: ProcessorInputs, lora_request: Optional[LoRARequest] = None): - if is_encoder_decoder_inputs(inputs): - # For encoder-decoder multimodal models, the max_prompt_len - # restricts the decoder prompt length - prompt_inputs = inputs["decoder" if self.model_config. 
- is_multimodal_model else "encoder"] - else: - prompt_inputs = inputs + encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs) - prompt_ids = SingletonInputsAdapter(prompt_inputs).prompt_token_ids + # For encoder-decoder multimodal models, the max_prompt_len + # restricts the decoder prompt length + if self.model_config.is_multimodal_model: + prompt_inputs = decoder_inputs + else: + prompt_inputs = encoder_inputs or decoder_inputs + + prompt_ids = prompt_inputs["prompt_token_ids"] if prompt_ids is None or len(prompt_ids) == 0: raise ValueError("Prompt cannot be empty") From 66aa4c0bf4973065b45172dc18346016a6087a10 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Thu, 27 Mar 2025 13:49:38 -0400 Subject: [PATCH 048/593] [Feature] Add middleware to log API Server responses (#15593) Signed-off-by: Yuan Tang --- vllm/entrypoints/openai/api_server.py | 16 ++++++++++++++++ vllm/envs.py | 5 +++++ 2 files changed, 21 insertions(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 374e43fb15341..1e735da641df9 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -24,6 +24,7 @@ from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse +from starlette.concurrency import iterate_in_threadpool from starlette.datastructures import State from starlette.routing import Mount from typing_extensions import assert_never @@ -846,6 +847,21 @@ def build_app(args: Namespace) -> FastAPI: response.headers["X-Request-Id"] = request_id return response + if envs.VLLM_DEBUG_LOG_API_SERVER_RESPONSE: + logger.warning("CAUTION: Enabling log response in the API Server. 
" + "This can include sensitive information and should be " + "avoided in production.") + + @app.middleware("http") + async def log_response(request: Request, call_next): + response = await call_next(request) + response_body = [ + section async for section in response.body_iterator + ] + response.body_iterator = iterate_in_threadpool(iter(response_body)) + logger.info("response_body={%s}", response_body[0].decode()) + return response + for middleware in args.middleware: module_path, object_name = middleware.rsplit(".", 1) imported = getattr(importlib.import_module(module_path), object_name) diff --git a/vllm/envs.py b/vllm/envs.py index 23c304f124d36..e5025485a2501 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -270,6 +270,11 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_API_KEY": lambda: os.environ.get("VLLM_API_KEY", None), + # Whether to log responses from API Server for debugging + "VLLM_DEBUG_LOG_API_SERVER_RESPONSE": + lambda: os.environ.get("VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False"). 
+ lower() == "true", + # S3 access information, used for tensorizer to load model from S3 "S3_ACCESS_KEY_ID": lambda: os.environ.get("S3_ACCESS_KEY_ID", None), From 13ac9cab21e3cd12acd0c94376bc2f6da3dca5cd Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 28 Mar 2025 01:52:00 +0800 Subject: [PATCH 049/593] [Misc] Avoid direct access of global `mm_registry` in `compute_encoder_budget` (#15621) Signed-off-by: DarkLight1337 --- vllm/v1/core/encoder_cache_manager.py | 16 ++++++++++++---- vllm/v1/core/sched/scheduler.py | 3 +++ vllm/v1/worker/gpu_model_runner.py | 6 +++--- vllm/v1/worker/tpu_model_runner.py | 1 + 4 files changed, 19 insertions(+), 7 deletions(-) diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 018379c1f43af..dc76df268c588 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING from vllm.logger import init_logger -from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal import MultiModalRegistry from vllm.v1.request import Request if TYPE_CHECKING: @@ -67,6 +67,7 @@ class EncoderCacheManager: def compute_encoder_budget( model_config: "ModelConfig", scheduler_config: "SchedulerConfig", + mm_registry: MultiModalRegistry, ) -> tuple[int, int]: """Compute the encoder cache budget based on the model and scheduler configurations. @@ -74,6 +75,7 @@ def compute_encoder_budget( Args: model_config: Model configuration. scheduler_config: Scheduler configuration. + mm_registry: Provides information about the token cost. 
Returns: - Compute budget for encoder execution, in unit of number of tokens @@ -89,7 +91,11 @@ def compute_encoder_budget( ( encoder_compute_budget, encoder_cache_size, - ) = _compute_encoder_budget_multimodal(model_config, scheduler_config) + ) = _compute_encoder_budget_multimodal( + model_config, + scheduler_config, + mm_registry, + ) return encoder_compute_budget, encoder_cache_size @@ -97,6 +103,7 @@ def compute_encoder_budget( def _compute_encoder_budget_multimodal( model_config: "ModelConfig", scheduler_config: "SchedulerConfig", + mm_registry: MultiModalRegistry, ) -> tuple[int, int]: """Compute the encoder cache budget based on the model and scheduler configurations for a multimodal model. @@ -104,6 +111,7 @@ def _compute_encoder_budget_multimodal( Args: model_config: Model configuration. scheduler_config: Scheduler configuration. + mm_registry: Provides information about the token cost. Returns: - Compute budget for encoder execution, in unit of number of tokens @@ -112,8 +120,8 @@ def _compute_encoder_budget_multimodal( in the input sequence. 
""" - max_tokens_by_modality_dict = MULTIMODAL_REGISTRY.get_max_tokens_per_item_by_nonzero_modality( # noqa: E501 - model_config) + max_tokens_by_modality_dict = mm_registry \ + .get_max_tokens_per_item_by_nonzero_modality(model_config) if not max_tokens_by_modality_dict: logger.warning( diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index ba7c691306bb1..87d30c8aefbf0 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -10,6 +10,7 @@ from typing import Optional, Union from vllm.config import (CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig, SpeculativeConfig) from vllm.logger import init_logger +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager, compute_encoder_budget) from vllm.v1.core.kv_cache_manager import KVCacheManager @@ -38,6 +39,7 @@ class Scheduler(SchedulerInterface): speculative_config: Optional[SpeculativeConfig], log_stats: bool, structured_output_manager: StructuredOutputManager, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ) -> None: self.scheduler_config = scheduler_config self.cache_config = cache_config @@ -93,6 +95,7 @@ class Scheduler(SchedulerInterface): encoder_compute_budget, encoder_cache_size = compute_encoder_budget( model_config=model_config, scheduler_config=scheduler_config, + mm_registry=mm_registry, ) # NOTE(woosuk): Here, "encoder" includes the vision encoder (and diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 230479f3f15e7..133ccf84832c4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -137,6 +137,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): encoder_compute_budget, encoder_cache_size = compute_encoder_budget( model_config=model_config, scheduler_config=scheduler_config, + mm_registry=self.mm_registry, ) self.max_num_encoder_input_tokens = encoder_compute_budget 
self.encoder_cache_size = encoder_cache_size @@ -1439,9 +1440,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): # NOTE: Currently model is profiled with a single non-text # modality with the max possible input tokens even when # it supports multiple. - max_tokens_by_modality_dict = ( - MULTIMODAL_REGISTRY. - get_max_tokens_per_item_by_nonzero_modality(self.model_config)) + max_tokens_by_modality_dict = self.mm_registry \ + .get_max_tokens_per_item_by_nonzero_modality(self.model_config) dummy_data_modality, max_tokens_per_mm_item = max( max_tokens_by_modality_dict.items(), key=lambda item: item[1]) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 65a4048ae74d6..abe1b338fb717 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -109,6 +109,7 @@ class TPUModelRunner: encoder_compute_budget, encoder_cache_size = compute_encoder_budget( model_config=model_config, scheduler_config=scheduler_config, + mm_registry=self.mm_registry, ) self.max_num_encoder_input_tokens = encoder_compute_budget self.encoder_cache_size = encoder_cache_size From 46450b8d33eeee3c619e4d6aea6652ee3d16386f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 27 Mar 2025 18:52:18 +0000 Subject: [PATCH 050/593] Use absolute placement for Ask AI button (#15628) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/source/_static/custom.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index be0b2a388e404..58bc2ebb9614b 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -10,8 +10,8 @@ document.addEventListener("DOMContentLoaded", function () { script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. 
script.setAttribute("runllm-name", "vLLM"); script.setAttribute("runllm-position", "BOTTOM_RIGHT"); - script.setAttribute("runllm-position-y", "20%"); - script.setAttribute("runllm-position-x", "3%"); + script.setAttribute("runllm-position-y", "120px"); + script.setAttribute("runllm-position-x", "20px"); script.setAttribute("runllm-assistant-id", "207"); script.async = true; From 4098b72210dc10761bb348b373bbd0fc9b23b0e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Thu, 27 Mar 2025 20:15:06 +0100 Subject: [PATCH 051/593] [Bugfix][TPU][V1] Fix recompilation (#15553) Signed-off-by: NickLucche --- .buildkite/run-tpu-v1-test.sh | 4 +- tests/v1/tpu/test_sampler.py | 69 +++--------------------------- vllm/v1/sample/tpu/metadata.py | 8 +--- vllm/v1/worker/tpu_model_runner.py | 8 +++- 4 files changed, 15 insertions(+), 74 deletions(-) diff --git a/.buildkite/run-tpu-v1-test.sh b/.buildkite/run-tpu-v1-test.sh index 6e1f79ae649e3..a93b79c0b1b28 100755 --- a/.buildkite/run-tpu-v1-test.sh +++ b/.buildkite/run-tpu-v1-test.sh @@ -32,7 +32,9 @@ docker run --privileged --net host --shm-size=16G -it \ && echo TEST_5 \ && python3 /workspace/vllm/examples/offline_inference/tpu.py \ && echo TEST_6 \ - && pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py" \ + && pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py \ + && echo TEST_7 \ + && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" \ # TODO: This test fails because it uses RANDOM_SEED sampling diff --git a/tests/v1/tpu/test_sampler.py b/tests/v1/tpu/test_sampler.py index 4e5a57bee3275..f535abedea229 100644 --- a/tests/v1/tpu/test_sampler.py +++ b/tests/v1/tpu/test_sampler.py @@ -1,7 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -import tempfile -from time import time - import pytest from vllm import LLM, envs @@ -15,60 +12,6 @@ if not envs.VLLM_USE_V1: ) -@pytest.mark.parametrize("model_name", ["D4nt3/Qwen2.5-two-layers"]) -@pytest.mark.skipif(not 
current_platform.is_tpu(), - reason="This test needs a TPU") -def test_sampler_compilation(model_name: str, monkeypatch): - """ - Check that no recompilation happens despite changing sampling parameters. - We can't read XLA metrics from the engine process, hence we measure time. - """ - with tempfile.TemporaryDirectory() as temp_dir: - monkeypatch.setenv("VLLM_XLA_CACHE_PATH", temp_dir) - # Compiling model init may still take some time, enforce_eager to skip. - llm = LLM(model_name, - enforce_eager=True, - max_num_seqs=16, - max_model_len=1024, - gpu_memory_utilization=0.5) - prompts = [ - "A robot may not injure a human being", - "It is only with the heart that one can see rightly;", - ] - # First inference should be slow - sampling_params = SamplingParams( - temperature=0.7, - # top_p=0.6, # TODO too slow! - top_k=10, - min_p=0.2, - max_tokens=16) - s = time() - _ = llm.generate(prompts, sampling_params) - run1 = time() - s - - # Second request with different params, but for which we - # compiled for in previous eager iteration. - sampling_params = SamplingParams(temperature=0.1, - top_k=12, - min_p=0.8, - max_tokens=24) - s = time() - _ = llm.generate(prompts, sampling_params) - run2 = time() - s - # Much faster after compiling - assert run1 * 0.1 > run2 - print("TIMES", run1, run2) - - # Third request with min_p set to "None". It will not trigger - # recompilation as a default 0 value will be used. - sampling_params = SamplingParams(max_tokens=24, temperature=0.0) - s = time() - _ = llm.generate(prompts, sampling_params) - run3 = time() - s - assert run1 * 0.1 > run3 - print("TIMES", run1, run3) - - @pytest.mark.parametrize("model_name", ["Qwen/Qwen2.5-1.5B-Instruct"]) @pytest.mark.skipif(not current_platform.is_tpu(), reason="This test needs a TPU") @@ -77,13 +20,11 @@ def test_sampler_different(model_name: str): Test significantly different sampling params to assert the model produces different results. 
""" - llm = LLM( - model_name, - enforce_eager=True, - max_num_seqs=1, - max_model_len=64, - # TODO: setting to 0.5 or it will go OOM - gpu_memory_utilization=0.5) + llm = LLM(model_name, + enforce_eager=False, + max_num_seqs=1, + max_model_len=512, + max_num_batched_tokens=512) prompts = [ "Write a short story about a robot that dreams for the first time." ] diff --git a/vllm/v1/sample/tpu/metadata.py b/vllm/v1/sample/tpu/metadata.py index d605c4b65e9d3..89d3ddf51d748 100644 --- a/vllm/v1/sample/tpu/metadata.py +++ b/vllm/v1/sample/tpu/metadata.py @@ -88,6 +88,7 @@ class TPUSupportedSamplingMetadata: # Copy slice from CPU to corresponding TPU pre-allocated tensor. # Pad value is the default one. cpu_tensor[num_reqs:padded_num_reqs] = fill_val + # Subtle compilation: len(tpu_tensor) must be >= `padded_num_reqs` tpu_tensor[:padded_num_reqs] = cpu_tensor[:padded_num_reqs] # NOTE NickLucche The sync CPU-TPU graph we produce here must be @@ -101,13 +102,6 @@ class TPUSupportedSamplingMetadata: copy_slice(input_batch.min_p_cpu_tensor, input_batch.min_p, DEFAULT_SAMPLING_PARAMS["min_p"]) - # copy_slice(input_batch.frequency_penalties_cpu_tensor, - # input_batch.frequency_penalties) - # copy_slice(input_batch.presence_penalties_cpu_tensor, - # input_batch.presence_penalties) - # copy_slice(input_batch.repetition_penalties_cpu_tensor, - # input_batch.repetition_penalties) - xm.mark_step() xm.wait_device_ops() diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index abe1b338fb717..97dfd23163dff 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -88,6 +88,8 @@ class TPUModelRunner: self.max_model_len = model_config.max_model_len self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size) self.max_num_tokens = scheduler_config.max_num_batched_tokens + # InputBatch needs to work with sampling tensors greater than padding + # to avoid dynamic shapes. Also, avoid suboptimal alignment. 
self.max_num_reqs = max(scheduler_config.max_num_seqs, MIN_NUM_SEQS) # Model-related. @@ -788,6 +790,7 @@ class TPUModelRunner: dummy_hidden = torch.randn((num_tokens, hsize), device=device, dtype=torch.bfloat16) + # Compile for [8, 16, .., 128,.., `self.max_num_reqs`] while True: indices = torch.zeros( num_reqs_to_sample, @@ -804,7 +807,9 @@ class TPUModelRunner: out = out.cpu() if num_reqs_to_sample >= self.max_num_reqs: break - num_reqs_to_sample *= 2 + # Make sure to compile the `max_num_reqs` upper-limit case + num_reqs_to_sample = _get_padded_num_reqs_with_upper_limit( + num_reqs_to_sample + 1, self.max_num_reqs) xm.wait_device_ops() end = time.perf_counter() logger.info("Compilation finished in in %.2f [secs].", end - start) @@ -897,7 +902,6 @@ class ModelWrapperV1(nn.Module): return hidden_states - # @torch.compile(backend="openxla", fullgraph=True, dynamic=False) def sample_from_hidden( self, hidden_states: torch.Tensor, From 32d669275b1068e3261a47715d30e842817e000b Mon Sep 17 00:00:00 2001 From: cnorman Date: Thu, 27 Mar 2025 17:04:32 -0500 Subject: [PATCH 052/593] Correct PowerPC to modern IBM Power (#15635) Signed-off-by: Christy Norman --- docs/source/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/index.md b/docs/source/index.md index 1624d5cf5aae7..402f242679041 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -43,7 +43,7 @@ vLLM is flexible and easy to use with: - Tensor parallelism and pipeline parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators. +- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators. 
- Prefix caching support - Multi-lora support From 112b3e5b3b5af2c70a7332d6fbf78ffc4f2a9339 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Thu, 27 Mar 2025 18:15:26 -0400 Subject: [PATCH 053/593] [CI] Update rules for applying `tpu` label. (#15634) Signed-off-by: Russell Bryant --- .github/mergify.yml | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/.github/mergify.yml b/.github/mergify.yml index 48b2a76be9359..e071ece6f1d5e 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -90,15 +90,34 @@ pull_request_rules: - name: label-tpu description: Automatically apply tpu label + # Keep this list in sync with `label-tpu-remove` conditions conditions: - or: - - files~=tpu + - files~=tpu.py + - files~=_tpu + - files~=tpu_ + - files~=/tpu/ - files~=pallas actions: label: add: - tpu +- name: label-tpu-remove + description: Automatically remove tpu label + # Keep this list in sync with `label-tpu` conditions + conditions: + - and: + - -files~=tpu.py + - -files~=_tpu + - -files~=tpu_ + - -files~=/tpu/ + - -files~=pallas + actions: + label: + remove: + - tpu + - name: ping author on conflicts and add 'needs-rebase' label conditions: - conflict From 15dac210f0e6b907f191911917238273042552ed Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 27 Mar 2025 16:14:41 -0700 Subject: [PATCH 054/593] [V1] AsyncLLM data parallel (#13923) Signed-off-by: Nick Hill --- .buildkite/test-pipeline.yaml | 5 + examples/offline_inference/data_parallel.py | 22 +- tests/v1/engine/test_engine_core_client.py | 8 +- tests/v1/test_async_llm_dp.py | 109 +++++++ vllm/config.py | 21 +- vllm/distributed/utils.py | 12 + vllm/engine/arg_utils.py | 10 + vllm/envs.py | 8 + vllm/utils.py | 14 +- vllm/v1/core/sched/scheduler.py | 17 +- vllm/v1/engine/__init__.py | 9 +- vllm/v1/engine/async_llm.py | 23 +- vllm/v1/engine/core.py | 208 ++++++++++-- vllm/v1/engine/core_client.py | 332 ++++++++++++++++---- vllm/v1/engine/llm_engine.py | 17 +- 
vllm/v1/executor/multiproc_executor.py | 38 ++- vllm/v1/metrics/loggers.py | 14 +- vllm/v1/utils.py | 11 +- 18 files changed, 722 insertions(+), 156 deletions(-) create mode 100644 tests/v1/test_async_llm_dp.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f22b2b0ab6f2f..428b4c593c38e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -135,12 +135,14 @@ steps: - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py + - tests/v1/test_async_llm_dp.py commands: # test with tp=2 and external_dp=2 - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py # test with internal dp - python3 ../examples/offline_inference/data_parallel.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py @@ -514,7 +516,10 @@ steps: - vllm/worker/worker.py - vllm/worker/model_runner.py - entrypoints/llm/test_collective_rpc.py + - tests/v1/test_async_llm_dp.py + - vllm/v1/engine/ commands: + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index 232afd8b73d00..04a79e2f8ae66 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -28,6 +28,7 @@ Multi-node: --master-port=13345 """ import os +from time import sleep from vllm import LLM, SamplingParams from vllm.utils import get_open_port @@ -36,14 +37,13 @@ from vllm.utils import get_open_port def main(model, dp_size, local_dp_rank, 
global_dp_rank, dp_master_ip, dp_master_port, GPUs_per_dp_rank): os.environ["VLLM_DP_RANK"] = str(global_dp_rank) + os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank) os.environ["VLLM_DP_SIZE"] = str(dp_size) os.environ["VLLM_DP_MASTER_IP"] = dp_master_ip os.environ["VLLM_DP_MASTER_PORT"] = str(dp_master_port) - # set devices for each dp_rank - os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( - str(i) - for i in range(local_dp_rank * GPUs_per_dp_rank, (local_dp_rank + 1) * - GPUs_per_dp_rank)) + + # CUDA_VISIBLE_DEVICES for each DP rank is set automatically inside the + # engine processes. # Sample prompts. prompts = [ @@ -90,6 +90,9 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, print(f"DP rank {global_dp_rank}, Prompt: {prompt!r}, " f"Generated text: {generated_text!r}") + # Give engines time to pause their processing loops before exiting. + sleep(1) + if __name__ == "__main__": import argparse @@ -152,8 +155,13 @@ if __name__ == "__main__": procs.append(proc) exit_code = 0 for proc in procs: - proc.join() - if proc.exitcode: + proc.join(timeout=300) + if proc.exitcode is None: + print(f"Killing process {proc.pid} that " + f"didn't stop within 5 minutes.") + proc.kill() + exit_code = 1 + elif proc.exitcode: exit_code = proc.exitcode exit(exit_code) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 48f451a589688..68844b877c17d 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -167,11 +167,11 @@ def test_engine_core_client(monkeypatch: pytest.MonkeyPatch, core_client: SyncMPClient = client - result = core_client._call_utility("echo", "testarg") + result = core_client.call_utility("echo", "testarg") assert result == "testarg" with pytest.raises(Exception) as e_info: - core_client._call_utility("echo", None, "help!") + core_client.call_utility("echo", None, "help!") assert str(e_info.value) == "Call to echo method failed: 
help!" @@ -238,10 +238,10 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch): core_client: AsyncMPClient = client - result = await core_client._call_utility_async("echo", "testarg") + result = await core_client.call_utility_async("echo", "testarg") assert result == "testarg" with pytest.raises(Exception) as e_info: - await core_client._call_utility_async("echo", None, "help!") + await core_client.call_utility_async("echo", None, "help!") assert str(e_info.value) == "Call to echo method failed: help!" diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py new file mode 100644 index 0000000000000..f0e031969e733 --- /dev/null +++ b/tests/v1/test_async_llm_dp.py @@ -0,0 +1,109 @@ +# SPDX-License-Identifier: Apache-2.0 + +import asyncio +import os +from contextlib import ExitStack +from typing import Optional + +import pytest + +from vllm import SamplingParams +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.inputs import PromptType +from vllm.platforms import current_platform +from vllm.sampling_params import RequestOutputKind +from vllm.v1.engine.async_llm import AsyncLLM +from vllm.v1.engine.core_client import DPAsyncMPClient + +engine_args = AsyncEngineArgs( + model="ibm-research/PowerMoE-3b", + enforce_eager=True, + disable_log_requests=True, + tensor_parallel_size=int(os.getenv("TP_SIZE", 1)), + data_parallel_size=int(os.getenv("DP_SIZE", 2)), +) + +if not current_platform.supports_v1(engine_args.create_model_config()): + pytest.skip(reason="Requires V1-supporting platform.", + allow_module_level=True) + + +async def generate(engine: AsyncLLM, + request_id: str, + prompt: PromptType, + output_kind: RequestOutputKind, + max_tokens: int, + prompt_logprobs: Optional[int] = None) -> tuple[int, str]: + # Ensure generate doesn't complete too fast for cancellation test. 
+ await asyncio.sleep(0.2) + + count = 0 + sampling_params = SamplingParams(max_tokens=max_tokens, + ignore_eos=True, + output_kind=output_kind, + temperature=0, + prompt_logprobs=prompt_logprobs) + async for out in engine.generate(request_id=request_id, + prompt=prompt, + sampling_params=sampling_params): + + num_tokens = len(out.outputs[0].token_ids) + if output_kind == RequestOutputKind.DELTA: + count += num_tokens + else: + count = num_tokens + + await asyncio.sleep(0.) + + return count, request_id + + +@pytest.mark.parametrize( + "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) +@pytest.mark.asyncio +async def test_load(output_kind: RequestOutputKind): + + with ExitStack() as after: + + prompt = "This is a test of data parallel" + + engine = AsyncLLM.from_engine_args(engine_args) + after.callback(engine.shutdown) + + NUM_REQUESTS = 100 + NUM_EXPECTED_TOKENS = 10 + + request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)] + + # Create concurrent requests. + tasks = [] + for request_id in request_ids: + tasks.append( + asyncio.create_task( + generate(engine, request_id, prompt, output_kind, + NUM_EXPECTED_TOKENS))) + + # Confirm that we got all the EXPECTED tokens from the requests. + done, pending = await asyncio.wait(tasks, + return_when=asyncio.FIRST_EXCEPTION) + for task in pending: + task.cancel() + for task in done: + num_generated_tokens, request_id = await task + assert num_generated_tokens == NUM_EXPECTED_TOKENS, ( + f"{request_id} generated {num_generated_tokens} but " + f"expected {NUM_EXPECTED_TOKENS}") + + assert not engine.output_processor.has_unfinished_requests() + + # testing internals here which may break + core_client: DPAsyncMPClient = engine.engine_core + # the engines only synchronize stopping every N steps so + # allow a small amount of time here. 
+ for _ in range(10): + if core_client.num_engines_running == 0: + break + await asyncio.sleep(0.5) + + assert core_client.num_engines_running == 0 + assert not core_client.reqs_in_flight diff --git a/vllm/config.py b/vllm/config.py index 687c8b56ec126..831fa2e4b06eb 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -40,7 +40,8 @@ from vllm.transformers_utils.config import ( from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.utils import is_s3, maybe_model_redirect from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless, - get_cpu_memory, random_uuid, resolve_obj_by_qualname) + get_cpu_memory, get_open_port, random_uuid, + resolve_obj_by_qualname) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -1389,6 +1390,8 @@ class ParallelConfig: tensor_parallel_size: int = 1 # Number of tensor parallel groups. data_parallel_size: int = 1 # Number of data parallel groups. data_parallel_rank: int = 0 # Rank of the data parallel group. + # Local rank of the data parallel group, defaults to global rank. + data_parallel_rank_local: Optional[int] = None # IP of the data parallel master. data_parallel_master_ip: str = "127.0.0.1" data_parallel_master_port: int = 29500 # Port of the data parallel master. @@ -1493,10 +1496,18 @@ class ParallelConfig: self.world_size = self.pipeline_parallel_size * \ self.tensor_parallel_size - self.data_parallel_size = envs.VLLM_DP_SIZE - self.data_parallel_rank = envs.VLLM_DP_RANK - self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP - self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT + if self.data_parallel_size > 1: + # Data parallel was specified in the engine args. + self.data_parallel_master_port = get_open_port() + # TODO multi-node + else: + # Otherwise fall back to env vars (e.g. for offline SPMD case). 
+ self.data_parallel_size = envs.VLLM_DP_SIZE + self.data_parallel_rank = envs.VLLM_DP_RANK + self.data_parallel_rank_local = envs.VLLM_DP_RANK_LOCAL + self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP + self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT + self.world_size_across_dp = self.world_size * self.data_parallel_size if self.distributed_executor_backend == "external_launcher": diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 84899358a6d66..b8178af5a2daa 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -15,6 +15,8 @@ import torch from torch.distributed import ProcessGroup, TCPStore from torch.distributed.distributed_c10d import (Backend, PrefixStore, _get_default_timeout, + _shutdown_backend, + _unregister_process_group, is_nccl_available) from torch.distributed.rendezvous import rendezvous @@ -333,3 +335,13 @@ def stateless_init_torch_distributed_process_group( pg._register_backend(device, backend_type, backend_class) return pg + + +def stateless_destroy_torch_distributed_process_group( + pg: ProcessGroup) -> None: + """ + Destroy ProcessGroup returned by + stateless_init_torch_distributed_process_group(). 
+ """ + _shutdown_backend(pg) + _unregister_process_group(pg.group_name) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 53af3e5717c52..a3b83c65a604a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -114,6 +114,7 @@ class EngineArgs: # number of P/D disaggregation (or other disaggregation) workers pipeline_parallel_size: int = 1 tensor_parallel_size: int = 1 + data_parallel_size: int = 1 enable_expert_parallel: bool = False max_parallel_loading_workers: Optional[int] = None block_size: Optional[int] = None @@ -442,6 +443,14 @@ class EngineArgs: type=int, default=EngineArgs.tensor_parallel_size, help='Number of tensor parallel replicas.') + parser.add_argument('--data-parallel-size', + '-dp', + type=int, + default=EngineArgs.data_parallel_size, + help='Number of data parallel replicas. ' + 'MoE layers will be sharded according to the ' + 'product of the tensor-parallel-size and ' + 'data-parallel-size.') parser.add_argument( '--enable-expert-parallel', action='store_true', @@ -1359,6 +1368,7 @@ class EngineArgs: parallel_config = ParallelConfig( pipeline_parallel_size=self.pipeline_parallel_size, tensor_parallel_size=self.tensor_parallel_size, + data_parallel_size=self.data_parallel_size, enable_expert_parallel=self.enable_expert_parallel, max_parallel_loading_workers=self.max_parallel_loading_workers, disable_custom_all_reduce=self.disable_custom_all_reduce, diff --git a/vllm/envs.py b/vllm/envs.py index e5025485a2501..5334667376b24 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -2,6 +2,7 @@ import hashlib import os +import sys import tempfile from typing import TYPE_CHECKING, Any, Callable, Optional @@ -95,6 +96,7 @@ if TYPE_CHECKING: VLLM_CUDART_SO_PATH: Optional[str] = None VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH: bool = True VLLM_DP_RANK: int = 0 + VLLM_DP_RANK_LOCAL: int = -1 VLLM_DP_SIZE: int = 1 VLLM_DP_MASTER_IP: str = "" VLLM_DP_MASTER_PORT: int = 0 @@ -625,6 +627,12 @@ environment_variables: dict[str, 
Callable[[], Any]] = { "VLLM_DP_RANK": lambda: int(os.getenv("VLLM_DP_RANK", "0")), + # Rank of the process in the data parallel setting. + # Defaults to VLLM_DP_RANK when not set. + "VLLM_DP_RANK_LOCAL": + lambda: int( + os.getenv("VLLM_DP_RANK_LOCAL", sys.modules[__name__].VLLM_DP_RANK)), + # World size of the data parallel setting "VLLM_DP_SIZE": lambda: int(os.getenv("VLLM_DP_SIZE", "1")), diff --git a/vllm/utils.py b/vllm/utils.py index 77f4e2dcf5e45..afe68a2b8cb3d 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -578,7 +578,7 @@ def get_open_port() -> int: dp_port = envs.VLLM_DP_MASTER_PORT while True: port = _get_open_port() - if port >= dp_port and port < dp_port + 10: + if dp_port <= port < dp_port + 10: continue return port return _get_open_port() @@ -2176,11 +2176,11 @@ def make_zmq_socket( if socket_type == zmq.constants.PULL: socket.setsockopt(zmq.constants.RCVHWM, 0) socket.setsockopt(zmq.constants.RCVBUF, buf_size) - socket.connect(path) + socket.bind(path) elif socket_type == zmq.constants.PUSH: socket.setsockopt(zmq.constants.SNDHWM, 0) socket.setsockopt(zmq.constants.SNDBUF, buf_size) - socket.bind(path) + socket.connect(path) else: raise ValueError(f"Unknown Socket Type: {socket_type}") @@ -2188,7 +2188,11 @@ def make_zmq_socket( @contextlib.contextmanager -def zmq_socket_ctx(path: str, socket_type: Any) -> Iterator[zmq.Socket]: +def zmq_socket_ctx( + path: str, + socket_type: Any, + linger: int = 0, +) -> Iterator[zmq.Socket]: """Context manager for a ZMQ socket""" ctx = zmq.Context() # type: ignore[attr-defined] @@ -2199,7 +2203,7 @@ def zmq_socket_ctx(path: str, socket_type: Any) -> Iterator[zmq.Socket]: logger.debug("Got Keyboard Interrupt.") finally: - ctx.destroy(linger=0) + ctx.destroy(linger=linger) def is_in_ray_actor(): diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 87d30c8aefbf0..448119761259c 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -37,9 +37,10 @@ 
class Scheduler(SchedulerInterface): cache_config: CacheConfig, lora_config: Optional[LoRAConfig], speculative_config: Optional[SpeculativeConfig], - log_stats: bool, structured_output_manager: StructuredOutputManager, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, + include_finished_set: bool = False, + log_stats: bool = False, ) -> None: self.scheduler_config = scheduler_config self.cache_config = cache_config @@ -48,6 +49,12 @@ class Scheduler(SchedulerInterface): self.log_stats = log_stats self.structured_output_manager = structured_output_manager + # include_finished_set controls whether a separate set of finished + # request ids should be included in the EngineCoreOutputs returned + # by update_from_outputs(). This is currently used in the multi-engine + # case to track request lifetimes efficiently. + self.include_finished_set = include_finished_set + # Scheduling constraints. self.max_num_running_reqs = self.scheduler_config.max_num_seqs self.max_num_scheduled_tokens = \ @@ -663,10 +670,16 @@ class Scheduler(SchedulerInterface): new_running.append(request) self.running = new_running - return EngineCoreOutputs( + engine_core_outputs = EngineCoreOutputs( outputs=outputs, scheduler_stats=self.make_stats(), ) + if self.include_finished_set: + #TODO currently sending duplicates here, improve this + engine_core_outputs.finished_requests = ( + scheduler_output.finished_req_ids | self.finished_req_ids) + + return engine_core_outputs def add_request(self, request: Request) -> None: self.waiting.append(request) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 3699779b3a0fe..0557d0c6c19d0 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -128,12 +128,18 @@ class EngineCoreOutputs( #NOTE(Nick): We could consider ways to make this more compact, # e.g. 
columnwise layout + engine_index: int = 0 + # [num_reqs] outputs: list[EngineCoreOutput] = [] scheduler_stats: Optional[SchedulerStats] = None timestamp: float = 0.0 utility_output: Optional[UtilityOutput] = None + finished_requests: Optional[set[str]] = None + + # In DP case, used to signal that the engine is paused. + engine_paused: bool = False def __post_init__(self): if self.timestamp == 0.0: @@ -147,4 +153,5 @@ class EngineCoreRequestType(enum.Enum): """ ADD = b'\x00' ABORT = b'\x01' - UTILITY = b'\x02' + START_DP = b'\x02' + UTILITY = b'\x03' diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 3a6811db31327..1fb9ae8cb7a59 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -66,11 +66,17 @@ class AsyncLLM(EngineClient): self.log_requests = log_requests self.log_stats = log_stats - self.stat_loggers: list[StatLoggerBase] = [] + + # Set up stat loggers; independent set for each DP rank. + self.stat_loggers: list[list[StatLoggerBase]] = [] if self.log_stats: - if logger.isEnabledFor(logging.INFO): - self.stat_loggers.append(LoggingStatLogger()) - self.stat_loggers.append(PrometheusStatLogger(vllm_config)) + for i in range(vllm_config.parallel_config.data_parallel_size): + loggers: list[StatLoggerBase] = [] + if logger.isEnabledFor(logging.INFO): + loggers.append(LoggingStatLogger(engine_index=i)) + loggers.append( + PrometheusStatLogger(vllm_config, engine_index=i)) + self.stat_loggers.append(loggers) # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( @@ -329,6 +335,7 @@ class AsyncLLM(EngineClient): # TODO(rob): make into a coroutine and launch it in # background thread once Prometheus overhead is non-trivial. 
self._record_stats( + engine_index=outputs.engine_index, scheduler_stats=outputs.scheduler_stats, iteration_stats=iteration_stats, ) @@ -350,12 +357,13 @@ class AsyncLLM(EngineClient): self, scheduler_stats: Optional[SchedulerStats], iteration_stats: Optional[IterationStats], + engine_index: int = 0, ): if not self.log_stats: return assert scheduler_stats is not None - for stat_logger in self.stat_loggers: + for stat_logger in self.stat_loggers[engine_index]: stat_logger.record(scheduler_stats=scheduler_stats, iteration_stats=iteration_stats) @@ -393,8 +401,9 @@ class AsyncLLM(EngineClient): scheduler_outputs=None, model_output=None, ) -> None: - for stat_logger in self.stat_loggers: - stat_logger.log() + for loggers in self.stat_loggers: + for stat_logger in loggers: + stat_logger.log() async def check_health(self) -> None: logger.debug("Called check_health.") diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 42511777feebb..20904cd495f91 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -1,12 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 - +import os import queue import signal +import sys import threading import time from concurrent.futures import Future from inspect import isclass, signature -from multiprocessing.connection import Connection +from logging import DEBUG from typing import Any, Optional import msgspec @@ -14,7 +15,9 @@ import psutil import zmq import zmq.asyncio -from vllm.config import VllmConfig +from vllm.config import ParallelConfig, VllmConfig +from vllm.distributed import stateless_destroy_torch_distributed_process_group +from vllm.executor.multiproc_worker_utils import _add_prefix from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.transformers_utils.config import ( @@ -91,6 +94,8 @@ class EngineCore: cache_config=vllm_config.cache_config, lora_config=vllm_config.lora_config, speculative_config=vllm_config.speculative_config, + 
include_finished_set=vllm_config.parallel_config.data_parallel_size + > 1, log_stats=self.log_stats, structured_output_manager=self.structured_output_manager, ) @@ -283,10 +288,10 @@ class EngineCoreProc(EngineCore): self, input_path: str, output_path: str, - ready_pipe: Connection, vllm_config: VllmConfig, executor_class: type[Executor], log_stats: bool, + engine_index: int = 0, ): super().__init__(vllm_config, executor_class, log_stats) @@ -302,14 +307,20 @@ class EngineCoreProc(EngineCore): args=(input_path, ), daemon=True).start() threading.Thread(target=self.process_output_socket, - args=(output_path, ), + args=(output_path, engine_index), daemon=True).start() - # Send Readiness signal to EngineClient. - ready_pipe.send({"status": "READY"}) + self.global_unfinished_reqs = False + + self.step_fn = (self.step if self.batch_queue is None else + self.step_with_batch_queue) @staticmethod - def run_engine_core(*args, **kwargs): + def run_engine_core(*args, + dp_rank: int = 0, + local_dp_rank: int = 0, + ready_pipe, + **kwargs): """Launch EngineCore busy loop in background process.""" # Signal handler used for graceful termination. @@ -331,9 +342,21 @@ class EngineCoreProc(EngineCore): signal.signal(signal.SIGINT, signal_handler) parent_process = psutil.Process().parent() - engine_core = None + engine_core: Optional[EngineCoreProc] = None try: - engine_core = EngineCoreProc(*args, **kwargs) + parallel_config: ParallelConfig = kwargs[ + "vllm_config"].parallel_config + if parallel_config.data_parallel_size > 1: + # Set data parallel rank for this engine process. + parallel_config.data_parallel_rank = dp_rank + parallel_config.data_parallel_rank_local = local_dp_rank + engine_core = DPEngineCoreProc(*args, **kwargs) + else: + engine_core = EngineCoreProc(*args, **kwargs) + + # Send Readiness signal to EngineClient. 
+ ready_pipe.send({"status": "READY"}) + engine_core.run_busy_loop() except SystemExit: @@ -351,28 +374,44 @@ class EngineCoreProc(EngineCore): def run_busy_loop(self): """Core busy loop of the EngineCore.""" - step_fn = (self.step - if self.batch_queue is None else self.step_with_batch_queue) - # Loop until process is sent a SIGINT or SIGTERM while True: # 1) Poll the input queue until there is work to do. - while not self.scheduler.has_requests(): - logger.debug("EngineCore busy loop waiting.") - req = self.input_queue.get() - self._handle_client_request(*req) + self._process_input_queue() + # 2) Step the engine core and return the outputs. + self._process_engine_step() - # 2) Handle any new client requests. - while not self.input_queue.empty(): - req = self.input_queue.get_nowait() - self._handle_client_request(*req) + def _process_input_queue(self): + """Exits when an engine step needs to be performed.""" - # 3) Step the engine core. - outputs = step_fn() + waited = False + while not self.global_unfinished_reqs and not ( + self.scheduler.has_requests()): + if logger.isEnabledFor(DEBUG) and self.input_queue.empty(): + logger.debug("EngineCore waiting for work.") + waited = True + req = self.input_queue.get() + self._handle_client_request(*req) - # 4) Put EngineCoreOutputs into the output queue. - if outputs is not None: - self.output_queue.put_nowait(outputs) + if waited: + logger.debug( + "EngineCore loop active - local unfinished: %s, finished: %s.", + self.scheduler.has_unfinished_requests(), + self.scheduler.has_finished_requests()) + + # Handle any more client requests. + while not self.input_queue.empty(): + req = self.input_queue.get_nowait() + self._handle_client_request(*req) + + def _process_engine_step(self): + """Called only when there are unfinished local requests.""" + + # Step the engine core. + outputs = self.step_fn() + # Put EngineCoreOutputs into the output queue. 
+ if outputs is not None: + self.output_queue.put_nowait(outputs) def _handle_client_request(self, request_type: EngineCoreRequestType, request: Any) -> None: @@ -382,6 +421,10 @@ class EngineCoreProc(EngineCore): self.add_request(request) elif request_type == EngineCoreRequestType.ABORT: self.abort_requests(request) + elif request_type == EngineCoreRequestType.START_DP: + if not self.global_unfinished_reqs: + logger.debug("EngineCore starting idle loop.") + self.global_unfinished_reqs = True elif request_type == EngineCoreRequestType.UTILITY: call_id, method_name, args = request output = UtilityOutput(call_id) @@ -432,7 +475,7 @@ class EngineCoreProc(EngineCore): # Push to input queue for core busy loop. self.input_queue.put_nowait((request_type, request)) - def process_output_socket(self, output_path: str): + def process_output_socket(self, output_path: str, engine_index: int): """Output socket IO thread.""" # Msgpack serialization encoding. @@ -443,5 +486,114 @@ class EngineCoreProc(EngineCore): with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: while True: outputs = self.output_queue.get() + outputs.engine_index = engine_index encoder.encode_into(outputs, buffer) - socket.send_multipart((buffer, ), copy=False) + socket.send(buffer, copy=False) + + +ENGINE_PAUSED_OUTPUTS = EngineCoreOutputs(engine_paused=True) + + +class DPEngineCoreProc(EngineCoreProc): + """ZMQ-wrapper for running EngineCore in background process + in a data parallel context.""" + + def __init__( + self, + input_path: str, + output_path: str, + vllm_config: VllmConfig, + executor_class: type[Executor], + log_stats: bool, + ): + # Add process-specific prefix to stdout and stderr before + # we initialize the engine. 
+ from multiprocessing import current_process + process_name = current_process().name + pid = os.getpid() + _add_prefix(sys.stdout, process_name, pid) + _add_prefix(sys.stderr, process_name, pid) + + dp_size = vllm_config.parallel_config.data_parallel_size + dp_rank = vllm_config.parallel_config.data_parallel_rank + local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local + + assert dp_size > 1 + assert 0 <= local_dp_rank <= dp_rank < dp_size + + from vllm.platforms import current_platform + if current_platform.is_cuda_alike(): + from vllm.platforms.cuda import device_id_to_physical_device_id + tp_size = vllm_config.parallel_config.tensor_parallel_size + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( + str(device_id_to_physical_device_id(i)) + for i in range(local_dp_rank * tp_size, (local_dp_rank + 1) * + tp_size)) + + self.dp_group = vllm_config.parallel_config.stateless_init_dp_group() + + # Initialize the engine after setting up environment. + super().__init__(input_path, output_path, vllm_config, executor_class, + log_stats, dp_rank) + + # Counts forward-passes of the model so that we can synchronize + # finished with DP peers every N steps. + self.counter = 0 + + def shutdown(self): + super().shutdown() + if dp_group := getattr(self, "dp_group", None): + stateless_destroy_torch_distributed_process_group(dp_group) + + def run_busy_loop(self): + """Core busy loop of the EngineCore for data parallel case.""" + + # Loop until process is sent a SIGINT or SIGTERM + while True: + # 1) Poll the input queue until there is work to do. + self._process_input_queue() + + local_unfinished_reqs = self.scheduler.has_unfinished_requests() + + if local_unfinished_reqs: + # 2) Step the engine core. + self._process_engine_step() + + # Check if we have now finished all requests. 
+ local_unfinished_reqs = ( + self.scheduler.has_unfinished_requests()) + else: + if self.scheduler.has_finished_requests(): + # There are no unfinished requests, but there are some + # finished requests remaining to be removed from the + # batch state. This engine step won't perform a forward + # pass but will flush the finished requests to ensure + # up-to-date state is returned in the engine outputs. + self._process_engine_step() + + if not self.global_unfinished_reqs: + # All engines are idle. + continue + + # There must be unfinished requests in DP peers, run a + # dummy forward pass. + self.execute_dummy_batch() + + # 3) All-reduce operation to determine global unfinished reqs. + self.global_unfinished_reqs = self._has_global_unfinished_reqs( + local_unfinished_reqs) + + if not self.global_unfinished_reqs: + # Notify client that we are pausing the loop. + self.output_queue.put_nowait(ENGINE_PAUSED_OUTPUTS) + + def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool: + + # Optimization - only perform finish-sync all-reduce every 16 steps. 
+ self.counter += 1 + if self.counter != 16: + return True + self.counter = 0 + + return ParallelConfig.has_unfinished_dp(self.dp_group, + local_unfinished) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 13b72c80dc0d4..c41ee6704be0f 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -8,10 +8,11 @@ import threading import uuid import weakref from abc import ABC, abstractmethod +from collections.abc import Awaitable, Sequence from concurrent.futures import Future -from dataclasses import dataclass +from dataclasses import dataclass, field from threading import Thread -from typing import Any, Optional, Union +from typing import Any, Callable, Optional, Union import zmq import zmq.asyncio @@ -60,6 +61,9 @@ class EngineCoreClient(ABC): "is not currently supported.") if multiprocess_mode and asyncio_mode: + if vllm_config.parallel_config.data_parallel_size > 1: + return DPAsyncMPClient(vllm_config, executor_class, log_stats) + return AsyncMPClient(vllm_config, executor_class, log_stats) if multiprocess_mode and not asyncio_mode: @@ -207,28 +211,74 @@ class InprocClient(EngineCoreClient): return self.engine_core.pin_lora(lora_id) +class CoreEngine: + """One per data parallel rank.""" + + def __init__( + self, + vllm_config: VllmConfig, + executor_class: type[Executor], + log_stats: bool, + ctx: Union[zmq.Context, zmq.asyncio.Context], + output_path: str, + index: int = 0, + local_dp_rank: int = 0, + ): + # Paths and sockets for IPC. + input_path = get_open_zmq_ipc_path() + self.input_socket = make_zmq_socket(ctx, input_path, + zmq.constants.PUSH) + try: + # Start EngineCore in background process. 
+ self.proc_handle = BackgroundProcHandle( + input_path=input_path, + output_path=output_path, + process_name=f"EngineCore_{index}", + target_fn=EngineCoreProc.run_engine_core, + process_kwargs={ + "vllm_config": vllm_config, + "dp_rank": index, + "local_dp_rank": local_dp_rank, + "executor_class": executor_class, + "log_stats": log_stats, + }) + + self.num_reqs_in_flight = 0 + finally: + if not hasattr(self, "num_reqs_in_flight"): + # Ensure socket is closed if process fails to start. + self.close() + + def send_multipart(self, msg_parts: Sequence): + return self.input_socket.send_multipart(msg_parts, copy=False) + + def close(self): + if proc_handle := getattr(self, "proc_handle", None): + proc_handle.shutdown() + if socket := getattr(self, "input_socket", None): + socket.close(linger=0) + + @dataclass class BackgroundResources: """Used as a finalizer for clean shutdown, avoiding circular reference back to the client object.""" - ctx: zmq.Context + ctx: Union[zmq.Context] + core_engines: list[CoreEngine] = field(default_factory=list) output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None - input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None - proc_handle: Optional[BackgroundProcHandle] = None shutdown_path: Optional[str] = None def __call__(self): """Clean up background resources.""" - if self.proc_handle is not None: - self.proc_handle.shutdown() + for core_engine in self.core_engines: + core_engine.close() + # ZMQ context termination can hang if the sockets # aren't explicitly closed first. if self.output_socket is not None: self.output_socket.close(linger=0) - if self.input_socket is not None: - self.input_socket.close(linger=0) if self.shutdown_path is not None: # We must ensure that the sync output socket is # closed cleanly in its own thread. @@ -284,7 +334,7 @@ class MPClient(EngineCoreClient): self.decoder = MsgpackDecoder(EngineCoreOutputs) # ZMQ setup. 
- sync_ctx = zmq.Context() + sync_ctx = zmq.Context(io_threads=2) self.ctx = zmq.asyncio.Context(sync_ctx) if asyncio_mode else sync_ctx # This will ensure resources created so far are closed @@ -293,28 +343,38 @@ class MPClient(EngineCoreClient): self.resources = BackgroundResources(ctx=sync_ctx) self._finalizer = weakref.finalize(self, self.resources) - # Paths for IPC. + # Paths and sockets for IPC. self.output_path = get_open_zmq_ipc_path() - input_path = get_open_zmq_ipc_path() - # Start EngineCore in background process. - self.resources.proc_handle = BackgroundProcHandle( - input_path=input_path, - output_path=self.output_path, - process_name="EngineCore", - target_fn=EngineCoreProc.run_engine_core, - process_kwargs={ - "vllm_config": vllm_config, - "executor_class": executor_class, - "log_stats": log_stats, - }) + new_core_engine = lambda index, local_dp_rank=None: CoreEngine( + vllm_config, executor_class, log_stats, self.ctx, self.output_path, + index, local_dp_rank) + + # Start engine core process(es). + self._init_core_engines(vllm_config, new_core_engine, + self.resources.core_engines) + + # Wait for engine core process(es) to start. + for engine in self.resources.core_engines: + engine.proc_handle.wait_for_startup() - # Create input socket. - self.resources.input_socket = make_zmq_socket(self.ctx, input_path, - zmq.constants.PUSH) - self.input_socket = self.resources.input_socket self.utility_results: dict[int, AnyFuture] = {} + def _init_core_engines( + self, + vllm_config: VllmConfig, + new_core_engine: Callable[[int, Optional[int]], CoreEngine], + core_engines: list[CoreEngine], + ) -> None: + + # Default case - single core engine. 
+ dp_rank = vllm_config.parallel_config.data_parallel_rank + local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local + core_engine = new_core_engine( + dp_rank, local_dp_rank if local_dp_rank is not None else dp_rank) + core_engines.append(core_engine) + self.core_engine = core_engine + def shutdown(self): self._finalizer() @@ -370,7 +430,7 @@ class SyncMPClient(MPClient): # shutdown signal, exit thread. break - (frame, ) = out_socket.recv_multipart(copy=False) + frame = out_socket.recv(copy=False) outputs = decoder.decode(frame.buffer) if outputs.utility_output: _process_utility_output(outputs.utility_output, @@ -391,18 +451,15 @@ class SyncMPClient(MPClient): def get_output(self) -> EngineCoreOutputs: return self.outputs_queue.get() - def _send_input(self, request_type: EngineCoreRequestType, - request: Any) -> None: - + def _send_input(self, request_type: EngineCoreRequestType, request: Any): # (RequestType, SerializedRequest) msg = (request_type.value, self.encoder.encode(request)) - self.input_socket.send_multipart(msg, copy=False) + self.core_engine.send_multipart(msg) - def _call_utility(self, method: str, *args) -> Any: + def call_utility(self, method: str, *args) -> Any: call_id = uuid.uuid1().int >> 64 future: Future[Any] = Future() self.utility_results[call_id] = future - self._send_input(EngineCoreRequestType.UTILITY, (call_id, method, args)) @@ -419,34 +476,34 @@ class SyncMPClient(MPClient): self._send_input(EngineCoreRequestType.ABORT, request_ids) def profile(self, is_start: bool = True) -> None: - self._call_utility("profile", is_start) + self.call_utility("profile", is_start) def reset_prefix_cache(self) -> None: - self._call_utility("reset_prefix_cache") + self.call_utility("reset_prefix_cache") def add_lora(self, lora_request: LoRARequest) -> bool: - return self._call_utility("add_lora", lora_request) + return self.call_utility("add_lora", lora_request) def remove_lora(self, lora_id: int) -> bool: - return 
self._call_utility("remove_lora", lora_id) + return self.call_utility("remove_lora", lora_id) def list_loras(self) -> set[int]: - return self._call_utility("list_loras") + return self.call_utility("list_loras") def pin_lora(self, lora_id: int) -> bool: - return self._call_utility("pin_lora", lora_id) + return self.call_utility("pin_lora", lora_id) def sleep(self, level: int = 1) -> None: - self._call_utility("sleep", level) + self.call_utility("sleep", level) def wake_up(self) -> None: - self._call_utility("wake_up") + self.call_utility("wake_up") def is_sleeping(self) -> bool: - return self._call_utility("is_sleeping") + return self.call_utility("is_sleeping") def execute_dummy_batch(self) -> None: - self._call_utility("execute_dummy_batch") + self.call_utility("execute_dummy_batch") class AsyncMPClient(MPClient): @@ -464,13 +521,21 @@ class AsyncMPClient(MPClient): self.outputs_queue: Optional[asyncio.Queue[EngineCoreOutputs]] = None self.queue_task: Optional[asyncio.Task] = None - async def _start_output_queue_task(self): + self.outputs_handler: Optional[Callable[ + [AsyncMPClient, EngineCoreOutputs], Awaitable[None]]] = None + + def _ensure_output_queue_task(self): + if self.outputs_queue is not None: + return + # Perform IO in separate task to parallelize as much as possible. # Avoid task having direct reference back to the client. 
self.outputs_queue = asyncio.Queue() decoder = self.decoder utility_results = self.utility_results outputs_queue = self.outputs_queue + output_handler = self.outputs_handler + _self_ref = weakref.ref(self) if output_handler else None output_path = self.output_path output_socket = make_zmq_socket(self.ctx, output_path, zmq.constants.PULL) @@ -483,34 +548,52 @@ class AsyncMPClient(MPClient): if outputs.utility_output: _process_utility_output(outputs.utility_output, utility_results) - else: + continue + + if output_handler is not None: + assert _self_ref is not None + _self = _self_ref() + if not _self: + # Client has been garbage collected, abort. + return + await output_handler(_self, outputs) + + if outputs.outputs or outputs.scheduler_stats: outputs_queue.put_nowait(outputs) self.queue_task = asyncio.create_task(process_outputs_socket(), name="EngineCoreOutputQueueTask") async def get_output_async(self) -> EngineCoreOutputs: - if self.outputs_queue is None: - await self._start_output_queue_task() - assert self.outputs_queue is not None + self._ensure_output_queue_task() + assert self.outputs_queue is not None return await self.outputs_queue.get() async def _send_input(self, request_type: EngineCoreRequestType, request: Any) -> None: + await self.core_engine.send_multipart( + (request_type.value, self.encoder.encode(request))) - msg = (request_type.value, self.encoder.encode(request)) - await self.input_socket.send_multipart(msg, copy=False) + self._ensure_output_queue_task() - if self.outputs_queue is None: - await self._start_output_queue_task() + async def call_utility_async(self, method: str, *args) -> Any: + return await self._call_utility_async(method, + *args, + engine=self.core_engine) - async def _call_utility_async(self, method: str, *args) -> Any: + async def _call_utility_async( + self, + method: str, + *args, + engine: CoreEngine, + ) -> Any: call_id = uuid.uuid1().int >> 64 future = asyncio.get_running_loop().create_future() 
self.utility_results[call_id] = future - await self._send_input(EngineCoreRequestType.UTILITY, - (call_id, method, args)) - + message = (EngineCoreRequestType.UTILITY.value, + self.encoder.encode((call_id, method, args))) + await engine.send_multipart(message) + self._ensure_output_queue_task() return await future async def add_request_async(self, request: EngineCoreRequest) -> None: @@ -524,31 +607,146 @@ class AsyncMPClient(MPClient): await self._send_input(EngineCoreRequestType.ABORT, request_ids) async def profile_async(self, is_start: bool = True) -> None: - await self._call_utility_async("profile", is_start) + await self.call_utility_async("profile", is_start) async def reset_prefix_cache_async(self) -> None: - await self._call_utility_async("reset_prefix_cache") + await self.call_utility_async("reset_prefix_cache") async def sleep_async(self, level: int = 1) -> None: - await self._call_utility_async("sleep", level) + await self.call_utility_async("sleep", level) async def wake_up_async(self) -> None: - await self._call_utility_async("wake_up") + await self.call_utility_async("wake_up") async def is_sleeping_async(self) -> bool: - return await self._call_utility_async("is_sleeping") + return await self.call_utility_async("is_sleeping") async def execute_dummy_batch_async(self) -> None: - await self._call_utility_async("execute_dummy_batch") + await self.call_utility_async("execute_dummy_batch") async def add_lora_async(self, lora_request: LoRARequest) -> bool: - return await self._call_utility_async("add_lora", lora_request) + return await self.call_utility_async("add_lora", lora_request) async def remove_lora_async(self, lora_id: int) -> bool: - return await self._call_utility_async("remove_lora", lora_id) + return await self.call_utility_async("remove_lora", lora_id) async def list_loras_async(self) -> set[int]: - return await self._call_utility_async("list_loras") + return await self.call_utility_async("list_loras") async def pin_lora_async(self, lora_id: 
int) -> bool: - return await self._call_utility_async("pin_lora", lora_id) + return await self.call_utility_async("pin_lora", lora_id) + + +class DPAsyncMPClient(AsyncMPClient): + """Asyncio-compatible client for multi-proc, multi-engine (data parallel) + EngineCore.""" + + def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], + log_stats: bool): + super().__init__(vllm_config, executor_class, log_stats) + + assert len(self.core_engines) > 1 + + # Control message used for triggering dp idle mode loop. + self.start_dp_msg = (EngineCoreRequestType.START_DP.value, + self.encoder.encode(None)) + + self.num_engines_running = 0 + self.reqs_in_flight: dict[str, CoreEngine] = {} + + self.outputs_handler = DPAsyncMPClient.process_engine_outputs # type: ignore[assignment] + + def _init_core_engines( + self, + vllm_config: VllmConfig, + new_core_engine: Callable[[int, Optional[int]], CoreEngine], + core_engines: list[CoreEngine], + ) -> None: + + # Launch a core engine for each data parallel rank. + dp_size = vllm_config.parallel_config.data_parallel_size + for i in range(dp_size): + # Multi-node not yet supported so local_dp_rank == dp_rank. + core_engines.append(new_core_engine(i, i)) + + self.core_engines = core_engines + + async def call_utility_async(self, method: str, *args) -> Any: + # Only the result from the first engine is returned. + return (await asyncio.gather(*[ + self._call_utility_async(method, *args, engine=engine) + for engine in self.core_engines + ]))[0] + + async def add_request_async(self, request: EngineCoreRequest) -> None: + # NOTE: text prompt is not needed in the core engine as it has been + # tokenized. 
+ request.prompt = None + + msg = (EngineCoreRequestType.ADD.value, self.encoder.encode(request)) + + chosen_engine = self.get_core_engine_for_request() + self.reqs_in_flight[request.request_id] = chosen_engine + chosen_engine.num_reqs_in_flight += 1 + if self.num_engines_running >= len(self.core_engines): + await chosen_engine.send_multipart(msg) + else: + # Send request to chosen engine and dp start loop + # control message to all other engines. + self.num_engines_running += len(self.core_engines) + await asyncio.gather(*[ + engine.send_multipart(msg if engine is + chosen_engine else self.start_dp_msg) + for engine in self.core_engines + ]) + + self._ensure_output_queue_task() + + def get_core_engine_for_request(self) -> CoreEngine: + return min(self.core_engines, key=lambda e: e.num_reqs_in_flight) + + @staticmethod + async def process_engine_outputs(self: "DPAsyncMPClient", + outputs: EngineCoreOutputs): + if self.reqs_in_flight: + for req_id in outputs.finished_requests or (): + if engine := self.reqs_in_flight.pop(req_id, None): + engine.num_reqs_in_flight -= 1 + + if outputs.engine_paused: + assert self.num_engines_running >= 1 + self.num_engines_running -= 1 + if not self.num_engines_running and self.reqs_in_flight: + # If there are requests in flight here, they must have + # been sent after the engines paused. We must make + # sure to start the other engines: + self.num_engines_running = len(self.core_engines) + coros = [ + engine.send_multipart(self.start_dp_msg) + for engine in self.core_engines + if not engine.num_reqs_in_flight + ] + if coros: + await asyncio.gather(*coros) + + async def abort_requests_async(self, request_ids: list[str]) -> None: + if not request_ids: + return + + if len(request_ids) == 1: + # Fast-path common case. 
+ if engine := self.reqs_in_flight.get(request_ids[0]): + await self._abort_requests(request_ids, engine) + return + + by_engine: dict[CoreEngine, list[str]] = {} + for req_id in request_ids: + if engine := self.reqs_in_flight.get(req_id): + by_engine.setdefault(engine, []).append(req_id) + for engine, req_ids in by_engine.items(): + await self._abort_requests(req_ids, engine) + + async def _abort_requests(self, request_ids: list[str], + engine: CoreEngine) -> None: + await engine.send_multipart((EngineCoreRequestType.ABORT.value, + self.encoder.encode(request_ids))) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 7bda3a30d2028..8cc73f9fe7224 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -8,6 +8,7 @@ from typing_extensions import TypeVar import vllm.envs as envs from vllm.config import ParallelConfig, VllmConfig +from vllm.distributed import stateless_destroy_torch_distributed_process_group from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics_types import StatLoggerBase from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType @@ -60,11 +61,13 @@ class LLMEngine: self.cache_config = vllm_config.cache_config # important: init dp group before init the engine_core - self.parallel_config = vllm_config.parallel_config - self.dp_enabled = self.parallel_config.data_parallel_size > 1 # noqa + # In the decoupled engine case this is handled in EngineCoreProc. + parallel_config = vllm_config.parallel_config + if not multiprocess_mode and parallel_config.data_parallel_size > 1: + self.dp_group = parallel_config.stateless_init_dp_group() + else: + self.dp_group = None self.should_execute_dummy_batch = False - if self.dp_enabled: - self.dp_group = self.parallel_config.stateless_init_dp_group() # Tokenizer (+ ensure liveness if running in another process). 
self.tokenizer = init_tokenizer_from_configs( @@ -148,7 +151,7 @@ class LLMEngine: def has_unfinished_requests(self) -> bool: has_unfinished = self.output_processor.has_unfinished_requests() - if not self.dp_enabled: + if self.dp_group is None: return has_unfinished return self.has_unfinished_requests_dp(has_unfinished) @@ -280,3 +283,7 @@ class LLMEngine: def pin_lora(self, lora_id: int) -> bool: """Prevent an adapter from being evicted.""" return self.engine_core.pin_lora(lora_id) + + def __del__(self): + if dp_group := getattr(self, "dp_group", None): + stateless_destroy_torch_distributed_process_group(dp_group) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 21e7d26506d3f..1d5175eb6adc3 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -235,7 +235,10 @@ class WorkerProc: worker_response_mq_handle = self.worker_response_mq.export_handle() # Send Readiness signal to EngineCore process. - with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: + # Set linger here because we want to ensure the message has + # been sent before the context is closed. 
+ with zmq_socket_ctx(ready_path, zmq.constants.PUSH, + linger=10000) as ready_socket: payload = pickle.dumps(worker_response_mq_handle, protocol=pickle.HIGHEST_PROTOCOL) ready_socket.send_string(WorkerProc.READY_STR) @@ -270,11 +273,13 @@ class WorkerProc: proc = context.Process(target=WorkerProc.worker_main, kwargs=process_kwargs, daemon=True) - proc.start() - # Wait for startup - worker_response_mq_handle = WorkerProc.wait_for_startup( - proc, ready_path) + with zmq_socket_ctx(ready_path, zmq.constants.PULL) as ready_socket: + proc.start() + + # Wait for startup + worker_response_mq_handle = WorkerProc.wait_for_startup( + proc, ready_socket) worker_response_mq = MessageQueue.create_from_handle( worker_response_mq_handle, 0) @@ -337,23 +342,22 @@ class WorkerProc: @staticmethod def wait_for_startup( proc: BaseProcess, - ready_path: str, + ready_socket: zmq.Socket, ) -> Optional[Handle]: """Wait until the Worker is ready.""" - with zmq_socket_ctx(ready_path, zmq.constants.PULL) as socket: - # Wait for Worker to send READY. - while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: - logger.debug("Waiting for WorkerProc to startup.") + # Wait for Worker to send READY. 
+ while ready_socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: + logger.debug("Waiting for WorkerProc to startup.") - if not proc.is_alive(): - raise RuntimeError("WorkerProc failed to start.") + if not proc.is_alive(): + raise RuntimeError("WorkerProc failed to start.") - message = socket.recv_string() - assert message == WorkerProc.READY_STR - handle_frame = socket.recv(copy=False) - handle = pickle.loads(handle_frame.buffer) - return handle + message = ready_socket.recv_string() + assert message == WorkerProc.READY_STR + handle_frame = ready_socket.recv(copy=False) + handle = pickle.loads(handle_frame.buffer) + return handle class ResponseStatus(Enum): SUCCESS = auto() diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index fcb4d4f5a25a6..6ffd00ebd17a1 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -31,7 +31,8 @@ class StatLoggerBase(ABC): class LoggingStatLogger(StatLoggerBase): - def __init__(self): + def __init__(self, engine_index: int = 0): + self.engine_index = engine_index self._reset(time.monotonic()) self.last_scheduler_stats = SchedulerStats() # Prefix cache metrics. This cannot be reset. @@ -78,11 +79,13 @@ class LoggingStatLogger(StatLoggerBase): # Format and print output. 
logger.info( + "Engine %03d: " "Avg prompt throughput: %.1f tokens/s, " "Avg generation throughput: %.1f tokens/s, " "Running: %d reqs, Waiting: %d reqs, " "GPU KV cache usage: %.1f%%, " "Prefix cache hit rate: %.1f%%", + self.engine_index, prompt_throughput, generation_throughput, scheduler_stats.num_running_reqs, @@ -94,7 +97,7 @@ class LoggingStatLogger(StatLoggerBase): class PrometheusStatLogger(StatLoggerBase): - def __init__(self, vllm_config: VllmConfig): + def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): self._unregister_vllm_metrics() # Use this flag to hide metrics that were deprecated in @@ -102,8 +105,11 @@ class PrometheusStatLogger(StatLoggerBase): self.show_hidden_metrics = \ vllm_config.observability_config.show_hidden_metrics - labelnames = ["model_name"] - labelvalues = [vllm_config.model_config.served_model_name] + labelnames = ["model_name", "engine"] + labelvalues = [ + vllm_config.model_config.served_model_name, + str(engine_index) + ] max_model_len = vllm_config.model_config.max_model_len diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 6c01ed3de52d7..f42b3501adb3b 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -105,7 +105,7 @@ class BackgroundProcHandle: process_kwargs: dict[Any, Any], ): context = get_mp_context() - reader, writer = context.Pipe(duplex=False) + self.reader, writer = context.Pipe(duplex=False) assert ("ready_pipe" not in process_kwargs and "input_path" not in process_kwargs @@ -115,14 +115,17 @@ class BackgroundProcHandle: process_kwargs["output_path"] = output_path # Run busy loop in background process. - self.proc = context.Process(target=target_fn, kwargs=process_kwargs) + self.proc = context.Process(target=target_fn, + kwargs=process_kwargs, + name=process_name) self._finalizer = weakref.finalize(self, shutdown, self.proc, input_path, output_path) self.proc.start() + def wait_for_startup(self): # Wait for startup. 
- if reader.recv()["status"] != "READY": - raise RuntimeError(f"{process_name} initialization failed. " + if self.reader.recv()["status"] != "READY": + raise RuntimeError(f"{self.proc.name} initialization failed. " "See root cause above.") def shutdown(self): From bd45912b99e3bad6621fd4d6bc103352ff31bcb7 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Thu, 27 Mar 2025 21:57:01 -0400 Subject: [PATCH 055/593] [TPU] Lazy Import (#15656) Signed-off-by: rshaw@neuralmagic.com --- vllm/distributed/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index b8178af5a2daa..4206a24465e28 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -15,7 +15,6 @@ import torch from torch.distributed import ProcessGroup, TCPStore from torch.distributed.distributed_c10d import (Backend, PrefixStore, _get_default_timeout, - _shutdown_backend, _unregister_process_group, is_nccl_available) from torch.distributed.rendezvous import rendezvous @@ -343,5 +342,7 @@ def stateless_destroy_torch_distributed_process_group( Destroy ProcessGroup returned by stateless_init_torch_distributed_process_group(). """ + # Lazy import for non-CUDA backends. 
+ from torch.distributed.distributed_c10d import _shutdown_backend _shutdown_backend(pg) _unregister_process_group(pg.group_name) From 726efc6a320ad9a4ef0b0378b40abbd0561ea394 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 28 Mar 2025 10:12:47 +0800 Subject: [PATCH 056/593] [Quantization][V1] BitsAndBytes support V1 (#15611) Signed-off-by: Jee Jee Li --- .../vision_language/test_mllama.py | 1 - tests/models/test_transformers.py | 1 - tests/quantization/test_bitsandbytes.py | 3 - vllm/config.py | 6 +- vllm/engine/arg_utils.py | 2 +- .../layers/quantization/bitsandbytes.py | 61 ++++++++++++++----- vllm/model_executor/model_loader/loader.py | 2 + 7 files changed, 52 insertions(+), 24 deletions(-) diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index ae7a7b028b152..260d2c1093879 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -425,7 +425,6 @@ def test_bnb_regression( max_model_len=4096, max_num_seqs=2, quantization="bitsandbytes", - load_format="bitsandbytes", ) sampling_params = SamplingParams( temperature=0, diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index c45fc7e649ec8..65bb11d6b5e4e 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -72,7 +72,6 @@ def test_distributed( "meta-llama/Llama-3.2-1B-Instruct", { "quantization": "bitsandbytes", - "load_format": "bitsandbytes", }, ), ]) diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py index 1b6a918401487..533b055ee6d53 100644 --- a/tests/quantization/test_bitsandbytes.py +++ b/tests/quantization/test_bitsandbytes.py @@ -101,8 +101,6 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None: "--enable-prefix-caching", "--quantization", "bitsandbytes", - "--load-format", - "bitsandbytes", 
"--gpu-memory-utilization", "0.7", ] @@ -137,7 +135,6 @@ def validate_generated_texts(hf_runner, # when using distributed inference with vllm_runner(model_name, quantization='bitsandbytes', - load_format='bitsandbytes', tensor_parallel_size=vllm_tp_size, enforce_eager=False) as llm: vllm_outputs = llm.generate_greedy(prompts, 8) diff --git a/vllm/config.py b/vllm/config.py index 831fa2e4b06eb..5c73ff56ebbcf 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -682,8 +682,9 @@ class ModelConfig: def _verify_bnb_config(self) -> None: """ - The current version of bitsandbytes (0.44.0) with 8-bit models does not + The current version of bitsandbytes (0.45.3) with 8-bit models does not yet support CUDA graph. + # TODO Remove this when bitsandbytes supports. """ is_bitsandbytes = self.quantization == "bitsandbytes" has_quantization_config = (getattr(self.hf_config, @@ -698,8 +699,9 @@ class ModelConfig: not self.enforce_eager, ]): logger.warning( - "CUDA graph is not supported on BitAndBytes 8bit yet, " + "CUDA graph is not supported on BitsAndBytes 8bit yet, " "fallback to the eager mode.") + self.enforce_eager = True def _verify_with_expert_parallelism(self) -> None: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a3b83c65a604a..d049f773caccd 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1616,7 +1616,7 @@ class EngineArgs: return False # Some quantization is not compatible with torch.compile. 
- V1_UNSUPPORTED_QUANT = ["bitsandbytes", "gguf"] + V1_UNSUPPORTED_QUANT = ["gguf"] if model_config.quantization in V1_UNSUPPORTED_QUANT: _raise_or_fallback( feature_name=f"--quantization {model_config.quantization}", diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 1e8e7aa1b8c12..f5d32efe83688 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -9,6 +9,7 @@ from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.utils import direct_register_custom_op class BitsAndBytesConfig(QuantizationConfig): @@ -321,9 +322,6 @@ class BitsAndBytesLinearMethod(LinearMethodBase): x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - # only load the bitsandbytes module when needed - from bitsandbytes import matmul_4bit - original_type = x.dtype original_shape = x.shape reshape_after_matmul = False @@ -343,19 +341,7 @@ class BitsAndBytesLinearMethod(LinearMethodBase): out_dim_1, dtype=torch.bfloat16, device=x.device) - - current_index = 0 - for i in range(len(quant_states)): - output_size = quant_states[i].shape[0] - # It is more efficient to use out kwarg like - # matmul_4bit(..., out = ...). Infeasible now due to the bug - # https://github.com/TimDettmers/bitsandbytes/issues/1235. - # Need to change after the bug is fixed. 
- out[:, current_index:current_index + output_size] = matmul_4bit( - bf_x, qweight[offsets[i]:offsets[i + 1]].t(), quant_states[i]) - - current_index += output_size - + apply_bnb_4bit(bf_x, qweight, offsets, out) out = out.to(original_type) if reshape_after_matmul: @@ -365,3 +351,46 @@ class BitsAndBytesLinearMethod(LinearMethodBase): out += bias return out + + +def _apply_bnb_4bit( + x: torch.Tensor, + weight: torch.Tensor, + offsets: torch.Tensor, + out: torch.Tensor, +) -> None: + # only load the bitsandbytes module when needed + from bitsandbytes import matmul_4bit + quant_states = weight.bnb_quant_state + current_index = 0 + for i in range(len(quant_states)): + output_size = quant_states[i].shape[0] + # It is more efficient to use out kwarg like + # matmul_4bit(..., out = ...). Infeasible now due to the bug + # https://github.com/TimDettmers/bitsandbytes/issues/1235. + # Need to change after the bug is fixed. + out[:, current_index:current_index + output_size] = matmul_4bit( + x, weight[offsets[i]:offsets[i + 1]].t(), quant_states[i]) + current_index += output_size + + +def _apply_bnb_4bit_fake( + x: torch.Tensor, + weight: torch.Tensor, + offsets: torch.Tensor, + out: torch.Tensor, +) -> None: + return + + +try: + direct_register_custom_op( + op_name="apply_bnb_4bit", + op_func=_apply_bnb_4bit, + mutates_args=["out"], + fake_impl=_apply_bnb_4bit_fake, + ) + apply_bnb_4bit = torch.ops.vllm.apply_bnb_4bit + +except AttributeError as error: + raise error diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index c969f18b822c4..5649cf2dd2cf1 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -1259,6 +1259,8 @@ class BitsAndBytesModelLoader(BaseModelLoader): pack_ratio) offsets = np.concatenate(([0], np.cumsum(num_elements))) + # Make torch infer_schema happy + offsets = torch.tensor(offsets).cpu() set_weight_attrs(param, {"bnb_shard_offsets": offsets}) if 
load_8bit: From 4e0f6076be71532272f114429a24a559f2656bef Mon Sep 17 00:00:00 2001 From: Kebe Date: Fri, 28 Mar 2025 10:13:41 +0800 Subject: [PATCH 057/593] [Bugfix] Fix failure to launch in Tensor Parallel TP mode on macOS. (#14948) Signed-off-by: Kebe Signed-off-by: youkaichao Co-authored-by: youkaichao --- docs/source/design/multiprocessing.md | 4 ++-- vllm/distributed/device_communicators/shm_broadcast.py | 9 +++++++-- vllm/platforms/cpu.py | 8 ++++++++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md index 55dae0bb92d4e..43fe5fe2e5e94 100644 --- a/docs/source/design/multiprocessing.md +++ b/docs/source/design/multiprocessing.md @@ -24,7 +24,7 @@ This document describes how vLLM deals with these challenges. [Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include: - `spawn` - spawn a new Python process. This will be the default as of Python - 3.14. + 3.14. In macOS, this is already the default. - `fork` - Use `os.fork()` to fork the Python interpreter. This is the default in Python versions prior to 3.14. @@ -34,7 +34,7 @@ This document describes how vLLM deals with these challenges. ### Tradeoffs `fork` is the fastest method, but is incompatible with dependencies that use -threads. +threads. If you are under macOS, using `fork` may cause the process to crash. `spawn` is more compatible with dependencies, but can be problematic when vLLM is used as a library. 
If the consuming code does not use a `__main__` guard (`if diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 0d54fc73c882b..11ed7c0843779 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -125,8 +125,13 @@ class ShmRingBuffer: lambda *args, **kwargs: None): try: self.shared_memory = shared_memory.SharedMemory(name=name) - assert ( - self.shared_memory.size == self.total_bytes_of_buffer) + # See https://docs.python.org/3/library/multiprocessing.shared_memory.html # noqa + # Some platforms allocate memory based on page size, + # so the shared memory block size may be larger or equal + # to the requested size. The size parameter is ignored + # when attaching to an existing block. + assert (self.shared_memory.size + >= self.total_bytes_of_buffer) except FileNotFoundError: # we might deserialize the object in a different node # in this case, this object is not used, diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 0eb747a4c4514..619219023f4da 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import os +import sys from typing import TYPE_CHECKING, Optional import psutil @@ -148,6 +149,13 @@ class CpuPlatform(Platform): # To hint IPEX uses shared memory based AllReduce os.environ["LOCAL_WORLD_SIZE"] = str( vllm_config.parallel_config.tensor_parallel_size) + if sys.platform == "darwin" and \ + envs.VLLM_WORKER_MULTIPROC_METHOD == "fork": + if os.environ.get('VLLM_WORKER_MULTIPROC_METHOD', None) is None: + logger.warning( + "Default to spawn method on MacOS. 
If this is not desired," + " set VLLM_WORKER_MULTIPROC_METHOD to fork explicitly.") + os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' @classmethod def is_pin_memory_available(cls) -> bool: From b4245a48df84e5e807b92de6066728eeeaff9190 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Thu, 27 Mar 2025 21:43:40 -0500 Subject: [PATCH 058/593] [Doc] Fix dead links in Job Board (#15637) Signed-off-by: wwl2755 --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 3e790827f53bb..a83ad764125c5 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -104,7 +104,7 @@ myst_url_schemes = { "classes": ["github"], }, "gh-project": { - "url": "https://github.com/vllm-project/projects/{{path}}", + "url": "https://github.com/orgs/vllm-project/projects/{{path}}", "title": "Project #{{path}}", "classes": ["github"], }, From 8a49eea74bb1e664381d32f1d041b5d1e651664d Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Thu, 27 Mar 2025 22:45:05 -0400 Subject: [PATCH 059/593] [CI][TPU] Temporarily Disable Quant Test on TPU (#15649) Signed-off-by: rshaw@neuralmagic.com --- .buildkite/run-tpu-v1-test.sh | 9 +++++---- tests/v1/tpu/test_basic.py | 3 --- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/.buildkite/run-tpu-v1-test.sh b/.buildkite/run-tpu-v1-test.sh index a93b79c0b1b28..7bd91575e1729 100755 --- a/.buildkite/run-tpu-v1-test.sh +++ b/.buildkite/run-tpu-v1-test.sh @@ -28,15 +28,16 @@ docker run --privileged --net host --shm-size=16G -it \ && echo TEST_3 \ && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \ && echo TEST_4 \ - && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ - && echo TEST_5 \ && python3 /workspace/vllm/examples/offline_inference/tpu.py \ - && echo TEST_6 \ + && echo TEST_5 \ && pytest -s -v 
/workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py \ - && echo TEST_7 \ + && echo TEST_6 \ && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" \ # TODO: This test fails because it uses RANDOM_SEED sampling # && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ +# TODO: Re-enable this after fixing recompilation in quantization. +# && echo TEST_4 \ +# && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ diff --git a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py index 417483853916b..591aa9c5878ae 100644 --- a/tests/v1/tpu/test_basic.py +++ b/tests/v1/tpu/test_basic.py @@ -31,14 +31,12 @@ TENSOR_PARALLEL_SIZES = [1] reason="This is a basic test for TPU only") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [5]) -@pytest.mark.parametrize("enforce_eager", [True]) @pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES) def test_models( vllm_runner: type[VllmRunner], monkeypatch: pytest.MonkeyPatch, model: str, max_tokens: int, - enforce_eager: bool, tensor_parallel_size: int, ) -> None: prompt = "The next numbers of the sequence " + ", ".join( @@ -51,7 +49,6 @@ def test_models( with vllm_runner( model, max_model_len=8192, - enforce_eager=enforce_eager, gpu_memory_utilization=0.7, max_num_seqs=16, tensor_parallel_size=tensor_parallel_size) as vllm_model: From 4ae17bf1e242d18d5cbf2eacdaf60185957d6f5b Mon Sep 17 00:00:00 2001 From: Wes Date: Thu, 27 Mar 2025 20:45:55 -0600 Subject: [PATCH 060/593] Revert "Use Cache Hinting for fused_moe kernel (#15511)" (#15645) Signed-off-by: Wes Medford --- .../model_executor/layers/fused_moe/fused_moe.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 0929530ebec4c..70d0037d7cb01 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ 
b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -189,11 +189,7 @@ def fused_moe_kernel_gptq_awq( mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K), other=0.0) - b = tl.load( - b_ptrs, - cache_modifier=".cg", - eviction_policy="evict_last", - ) + b = tl.load(b_ptrs) if use_int4_w4a16: b = (b >> b_shifter) & 0xF @@ -395,13 +391,9 @@ def fused_moe_kernel( mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K), other=0.0) - b = tl.load( - b_ptrs, - mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, - other=0.0, - cache_modifier=".cg", - eviction_policy="evict_last", - ) + b = tl.load(b_ptrs, + mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, + other=0.0) # We accumulate along the K dimension. if use_int8_w8a16: accumulator = tl.dot(a, b.to(compute_type), acc=accumulator) From e7f720ea569c37d026d80cedc78944fe0dbc6a86 Mon Sep 17 00:00:00 2001 From: Chen Xia Date: Thu, 27 Mar 2025 19:47:05 -0700 Subject: [PATCH 061/593] [Misc]add coding benchmark for speculative decoding (#15303) Signed-off-by: CXIAAAAA --- benchmarks/benchmark_dataset.py | 63 ++++++++++++++++++++++++++++++ benchmarks/benchmark_serving.py | 16 +++++--- benchmarks/benchmark_throughput.py | 43 ++++++++++++-------- 3 files changed, 101 insertions(+), 21 deletions(-) diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 0567875f9862f..38ef739c69f9e 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -715,3 +715,66 @@ class VisionArenaDataset(HuggingFaceDataset): )) self.maybe_oversample_requests(sampled_requests, num_requests) return sampled_requests + + +# ----------------------------------------------------------------------------- +# Instruct Coder Dataset Implementation +# ----------------------------------------------------------------------------- + + +class InstructCoderDataset(HuggingFaceDataset): + """ + InstructCoder Dataset. 
+ https://huggingface.co/datasets/likaixin/InstructCoder + + InstructCoder is the dataset designed for general code editing. + It consists of 114,239 instruction-input-output triplets, + and covers multiple distinct code editing scenario. + """ + + DEFAULT_OUTPUT_LEN = 200 # this is the average default output length + DEFAULT_NUM_REQUESTS = 1000 + INSTRUCT_CODER_DATASET_PATH = "likaixin/InstructCoder" + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + if self.dataset_path != self.INSTRUCT_CODER_DATASET_PATH: + raise ValueError(f"Only support likaixin/InstructCoder dataset.\ + This data path {self.dataset_path} is not valid.") + if self.dataset_subset is None and self.dataset_split != "train": + raise ValueError("Dataset split must be 'train'.") + + def load_data(self) -> None: + dataset = load_dataset( + self.dataset_path, + name=self.dataset_subset, + split=self.dataset_split, + streaming=True, + ) + self.data = dataset.shuffle(seed=self.random_seed) + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs) -> list: + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + sampled_requests = [] + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = f"{item['instruction']}:\n{item['input']}" + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + )) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 47627126b6688..82c6b426b9a2b 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -53,8 +53,9 @@ except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser from benchmark_dataset 
import (BurstGPTDataset, HuggingFaceDataset, - RandomDataset, SampleRequest, ShareGPTDataset, - SonnetDataset, VisionArenaDataset) + InstructCoderDataset, RandomDataset, + SampleRequest, ShareGPTDataset, SonnetDataset, + VisionArenaDataset) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -588,9 +589,14 @@ def main(args: argparse.Namespace): elif args.dataset_name == "hf": # Choose between VisionArenaDataset # and HuggingFaceDataset based on provided parameters. - dataset_class = (VisionArenaDataset if args.dataset_path - == VisionArenaDataset.VISION_ARENA_DATASET_PATH - and args.hf_subset is None else HuggingFaceDataset) + dataset_class = HuggingFaceDataset + if args.dataset_path == VisionArenaDataset.VISION_ARENA_DATASET_PATH: + assert args.hf_subset is None, "VisionArenaDataset needs hf_subset to be None." #noqa: E501 + dataset_class = VisionArenaDataset + elif args.dataset_path == "likaixin/InstructCoder": + dataset_class = InstructCoderDataset + args.hf_split = "train" + input_requests = dataset_class( dataset_path=args.dataset_path, dataset_subset=args.hf_subset, diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 53869db478c51..f2f68b0d1e5e2 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -12,8 +12,9 @@ from typing import Any, Optional, Union import torch import uvloop from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset, - RandomDataset, SampleRequest, ShareGPTDataset, - SonnetDataset, VisionArenaDataset) + InstructCoderDataset, RandomDataset, + SampleRequest, ShareGPTDataset, SonnetDataset, + VisionArenaDataset) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, @@ -300,6 +301,7 @@ def get_requests(args, tokenizer): "input_len": args.input_len, "output_len": args.output_len, } 
+ if args.dataset_path is None or args.dataset_name == "random": sample_kwargs["range_ratio"] = args.random_range_ratio sample_kwargs["prefix_len"] = args.prefix_len @@ -317,17 +319,21 @@ def get_requests(args, tokenizer): elif args.dataset_name == "burstgpt": dataset_cls = BurstGPTDataset elif args.dataset_name == "hf": - if args.backend != "vllm-chat": - raise ValueError( - "hf datasets only are supported by vllm-chat backend") - # Choose between VisionArenaDataset and HuggingFaceDataset based on - # provided parameters. - dataset_cls = (VisionArenaDataset if args.dataset_path - == VisionArenaDataset.VISION_ARENA_DATASET_PATH - and args.hf_subset is None else HuggingFaceDataset) - common_kwargs['dataset_subset'] = args.hf_subset - common_kwargs['dataset_split'] = args.hf_split - sample_kwargs["enable_multimodal_chat"] = True + if args.dataset_path == VisionArenaDataset.VISION_ARENA_DATASET_PATH: + if args.args.backend == "vllm-chat": + raise ValueError( + "hf datasets only are supported by vllm-chat backend") + # Choose between VisionArenaDataset and HuggingFaceDataset based on + # provided parameters. 
+ dataset_cls = (VisionArenaDataset if args.dataset_path + == VisionArenaDataset.VISION_ARENA_DATASET_PATH + and args.hf_subset is None else HuggingFaceDataset) + common_kwargs['dataset_subset'] = args.hf_subset + common_kwargs['dataset_split'] = args.hf_split + sample_kwargs["enable_multimodal_chat"] = True + elif args.dataset_path == "likaixin/InstructCoder": + dataset_cls = InstructCoderDataset + common_kwargs['dataset_split'] = "train" else: raise ValueError(f"Unknown dataset name: {args.dataset_name}") @@ -462,9 +468,14 @@ def validate_args(args): warnings.warn("--hf-subset and --hf-split will be ignored \ since --dataset-name is not 'hf'.", stacklevel=2) - elif args.dataset_name == "hf" and args.backend != "vllm-chat": - raise ValueError( - "When --dataset-name is 'hf', backend must be 'vllm-chat'") + elif args.dataset_name == "hf": + if args.dataset_path == VisionArenaDataset.VISION_ARENA_DATASET_PATH: + assert args.backend == "vllm-chat", "VisionArenaDataset needs to use vllm-chat as the backend." #noqa: E501 + elif args.dataset_path == "likaixin/InstructCoder": + assert args.backend == "vllm", "InstructCoder dataset needs to use vllm as the backend." 
#noqa: E501 + else: + raise ValueError( + f"{args.dataset_path} is not supported by hf dataset.") # --random-range-ratio: only used when dataset_name is 'random' if args.dataset_name != 'random' and args.random_range_ratio is not None: From 4d0ec37267afaf988e32174ebc31f24268076491 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Thu, 27 Mar 2025 22:58:16 -0400 Subject: [PATCH 062/593] [Quantization][FP8] Adding support for fp8 gemm layer input in fp8 (#14578) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Gregory Shtrasberg Co-authored-by: Luka Govedič --- .../schemes/compressed_tensors_w8a8_fp8.py | 2 ++ .../layers/quantization/fbgemm_fp8.py | 2 ++ .../model_executor/layers/quantization/fp8.py | 17 ++++++++++++ .../quark/schemes/quark_w8a8_fp8.py | 2 ++ .../layers/quantization/utils/w8a8_utils.py | 27 ++++++++++++------- 5 files changed, 41 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 27a74d677da7b..e99a452963f48 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -23,6 +23,7 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): def __init__(self, strategy: str, is_static_input_scheme: bool): self.strategy = strategy + self.out_dtype = torch.get_default_dtype() self.is_static_input_scheme = is_static_input_scheme self.fp8_linear = Fp8LinearOp(use_per_token_if_dynamic=True) @@ -143,5 +144,6 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): return self.fp8_linear.apply(input=x, weight=layer.weight, weight_scale=layer.weight_scale, + out_dtype=self.out_dtype, 
input_scale=layer.input_scale, bias=bias) diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 1cc431c5cc7be..7dddc40f3446d 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -73,6 +73,7 @@ class FBGEMMFp8LinearMethod(LinearMethodBase): def __init__(self, quant_config: FBGEMMFp8Config): self.quant_config = quant_config self.fp8_linear = Fp8LinearOp(use_per_token_if_dynamic=True) + self.out_dtype = torch.get_default_dtype() def create_weights( self, @@ -161,6 +162,7 @@ class FBGEMMFp8LinearMethod(LinearMethodBase): return self.fp8_linear.apply(input=x, weight=layer.weight, weight_scale=layer.weight_scale, + out_dtype=self.out_dtype, input_scale=None, input_scale_ub=layer.input_scale_ub, bias=bias) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index f3907b4784b54..11bfdb4180531 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -116,6 +116,21 @@ class Fp8Config(QuantizationConfig): return Fp8KVCacheMethod(self) return None + def get_cache_scale(self, name: str) -> Optional[str]: + """ + Check whether the param name matches the format for k/v cache scales + in compressed-tensors. If this is the case, return its equivalent + param name expected by vLLM + + :param name: param name + :return: matching param name for KV cache scale in vLLM + """ + if name.endswith(".output_scale") and ".k_proj" in name: + return name.replace(".k_proj.output_scale", ".attn.k_scale") + if name.endswith(".output_scale") and ".v_proj" in name: + return name.replace(".v_proj.output_scale", ".attn.v_scale") + return None + class Fp8LinearMethod(LinearMethodBase): """Linear method for FP8. 
@@ -138,6 +153,7 @@ class Fp8LinearMethod(LinearMethodBase): def __init__(self, quant_config: Fp8Config): self.quant_config = quant_config self.cutlass_block_fp8_supported = cutlass_block_fp8_supported() + self.out_dtype = torch.get_default_dtype() # For GPUs that lack FP8 hardware support, we can leverage the Marlin # kernel for fast weight-only FP8 quantization @@ -386,6 +402,7 @@ class Fp8LinearMethod(LinearMethodBase): return self.fp8_linear.apply(input=x, weight=layer.weight, weight_scale=layer.weight_scale, + out_dtype=self.out_dtype, input_scale=layer.input_scale, bias=bias) diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py index 3e4251e46931c..c161849c8c5a2 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py @@ -22,6 +22,7 @@ class QuarkW8A8Fp8(QuarkScheme): self.qscheme = qscheme self.is_static_input_scheme = is_static_input_scheme self.fp8_linear = Fp8LinearOp(use_per_token_if_dynamic=True) + self.out_dtype = torch.get_default_dtype() @classmethod def get_min_capability(cls) -> int: @@ -134,5 +135,6 @@ class QuarkW8A8Fp8(QuarkScheme): return self.fp8_linear.apply(input=x, weight=layer.weight, weight_scale=layer.weight_scale, + out_dtype=self.out_dtype, input_scale=layer.input_scale, bias=bias) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index c2bd4bce560e7..b8e6384d7359f 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -163,6 +163,7 @@ class Fp8LinearOp: input: torch.Tensor, weight: torch.Tensor, weight_scale: torch.Tensor, + out_dtype: Optional[torch.dtype] = None, input_scale: Optional[torch.Tensor] = None, input_scale_ub: Optional[torch.Tensor] = None, 
bias: Optional[torch.Tensor] = None, @@ -182,8 +183,13 @@ class Fp8LinearOp: if use_per_token_if_dynamic is None: use_per_token_if_dynamic = self.use_per_token_if_dynamic + if out_dtype is None: + out_dtype = input.dtype + # cutlass_scaled_mm supports per tensor/channel W and per tensor/token A if self.cutlass_fp8_supported: + assert input.dtype != current_platform.fp8_dtype( + ), "FP8 input to cutlass is not currently implemented" qinput, x_scale = ops.scaled_fp8_quant( input_2d, input_scale, @@ -193,7 +199,7 @@ class Fp8LinearOp: # Fused GEMM_DQ output = ops.cutlass_scaled_mm(qinput, weight, - out_dtype=input.dtype, + out_dtype=out_dtype, scale_a=x_scale, scale_b=weight_scale, bias=bias) @@ -202,12 +208,15 @@ class Fp8LinearOp: # torch.scaled_mm supports per tensor weights + activations only # so fallback to naive if per channel or per token else: - # Maybe apply padding to output, see comment in __init__ - qinput, x_scale = ops.scaled_fp8_quant( - input_2d, - input_scale, - num_token_padding=self.output_padding, - use_per_token_if_dynamic=use_per_token_if_dynamic) + if input.dtype != current_platform.fp8_dtype(): + # Maybe apply padding to output, see comment in __init__ + qinput, x_scale = ops.scaled_fp8_quant( + input_2d, + input_scale, + num_token_padding=self.output_padding, + use_per_token_if_dynamic=use_per_token_if_dynamic) + else: + qinput, x_scale = input_2d, input_scale per_tensor_weights = (weight_scale.numel() == 1) per_tensor_activations = (x_scale.numel() == 1) @@ -216,7 +225,7 @@ class Fp8LinearOp: # Fused GEMM_DQ output = torch._scaled_mm(qinput, weight, - out_dtype=input.dtype, + out_dtype=out_dtype, scale_a=x_scale, scale_b=weight_scale, bias=bias) @@ -240,7 +249,7 @@ class Fp8LinearOp: # Fused GEMM_DQ Rowwise GEMM output = torch._scaled_mm(qinput, weight, - out_dtype=input.dtype, + out_dtype=out_dtype, scale_a=x_scale, scale_b=weight_scale.t(), bias=bias) From cec8c7d7f8753d13737427ceb5cebe987f5f0549 Mon Sep 17 00:00:00 2001 From: "Jason 
(Siyu) Zhu" Date: Thu, 27 Mar 2025 20:27:20 -0700 Subject: [PATCH 063/593] Refactor error handling for multiple exceptions in preprocessing (#15650) Signed-off-by: JasonZhu1313 --- vllm/entrypoints/openai/serving_chat.py | 12 ++---------- vllm/entrypoints/openai/serving_embedding.py | 5 +---- vllm/entrypoints/openai/serving_pooling.py | 8 +------- vllm/entrypoints/openai/serving_tokenization.py | 8 +------- 4 files changed, 5 insertions(+), 28 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 3c35a848ea3a5..3102db4050f5b 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -197,16 +197,8 @@ class OpenAIServingChat(OpenAIServing): truncate_prompt_tokens=request.truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - except ValueError as e: - logger.exception("Error in preprocessing prompt inputs") - return self.create_error_response(str(e)) - except TypeError as e: - logger.exception("Error in preprocessing prompt inputs") - return self.create_error_response(str(e)) - except RuntimeError as e: - logger.exception("Error in preprocessing prompt inputs") - return self.create_error_response(str(e)) - except jinja2.TemplateError as e: + except (ValueError, TypeError, RuntimeError, + jinja2.TemplateError) as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 1c2c78aaf8926..0ee58672631d0 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -139,10 +139,7 @@ class OpenAIServingEmbedding(OpenAIServing): truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - except ValueError as e: - logger.exception("Error in preprocessing prompt inputs") - return self.create_error_response(str(e)) - 
except TypeError as e: + except (ValueError, TypeError) as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 894128ee974cd..779a3eded2c16 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -136,13 +136,7 @@ class OpenAIServingPooling(OpenAIServing): truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - except ValueError as e: - logger.exception("Error in preprocessing prompt inputs") - return self.create_error_response(str(e)) - except TypeError as e: - logger.exception("Error in preprocessing prompt inputs") - return self.create_error_response(str(e)) - except jinja2.TemplateError as e: + except (ValueError, TypeError, jinja2.TemplateError) as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 90c0da2a24d51..c642fc51005ea 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -89,13 +89,7 @@ class OpenAIServingTokenization(OpenAIServing): request.prompt, add_special_tokens=request.add_special_tokens, ) - except ValueError as e: - logger.exception("Error in preprocessing prompt inputs") - return self.create_error_response(str(e)) - except TypeError as e: - logger.exception("Error in preprocessing prompt inputs") - return self.create_error_response(str(e)) - except jinja2.TemplateError as e: + except (ValueError, TypeError, jinja2.TemplateError) as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) From 8693e47e6ab52c1219323141d9b0eba89c4143b7 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 28 Mar 2025 13:51:05 +0800 Subject: 
[PATCH 064/593] [Bugfix] Fix `mm_hashes` forgetting to be passed (#15668) Signed-off-by: DarkLight1337 --- vllm/inputs/preprocess.py | 2 ++ vllm/model_executor/models/llava.py | 2 ++ vllm/model_executor/models/mllama.py | 2 +- vllm/model_executor/models/phi4mm.py | 16 ++++++++-------- .../models/prithvi_geospatial_mae.py | 1 + vllm/multimodal/inputs.py | 2 +- 6 files changed, 15 insertions(+), 10 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 33f39bedea5b5..5cda5e5e3dee4 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -528,6 +528,7 @@ class InputPreprocessor: prompt_token_ids=decoder_inputs_to_override[ "prompt_token_ids"], mm_kwargs=inputs["mm_kwargs"], + mm_hashes=inputs["mm_hashes"], mm_placeholders=inputs["mm_placeholders"], ) else: @@ -536,6 +537,7 @@ class InputPreprocessor: prompt=inputs["prompt"], prompt_token_ids=inputs["prompt_token_ids"], mm_kwargs=inputs["mm_kwargs"], + mm_hashes=inputs["mm_hashes"], mm_placeholders=inputs["mm_placeholders"], ) elif inputs["type"] == "token": diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 826f04b37547b..45a0bf73b837d 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -868,6 +868,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): mm_items = self._to_mm_items(mm_data) mm_item_counts = mm_items.get_all_counts() mm_kwargs = result["mm_kwargs"] + mm_hashes = result["mm_hashes"] # We reimplement the functionality of MLlavaProcessor from # https://github.com/TIGER-AI-Lab/Mantis.git @@ -916,6 +917,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): prompt=prompt, prompt_token_ids=prompt_ids, mm_kwargs=mm_kwargs, + mm_hashes=mm_hashes, mm_placeholders=mm_placeholder_ranges, ) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 9ed49597cf827..d2c8fb7237274 100644 --- a/vllm/model_executor/models/mllama.py +++ 
b/vllm/model_executor/models/mllama.py @@ -1378,7 +1378,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal, # Because attn_metadata.encoder_seq_lens only counts the last # group of images for each sample, which is used to cheat the # block manager to allocate blocks for those images only. - # See input_processor_for_mllama() for more details. + # See MllamaMultiModalProcessor for more details. num_tiles_tensor = kwargs.pop("num_tiles") num_tiles = [t.tolist() for t in num_tiles_tensor] num_tokens_per_tile = calc_token_per_chunk(self.image_size) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 3d4505d556e2c..cb75ee1ea2ccd 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -28,7 +28,7 @@ from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalInputs, NestedTensors +from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config @@ -1319,9 +1319,9 @@ def dummy_data_for_phi4mm(ctx: InputContext, seq_len: int, def input_mapper_for_phi4mm_audio(ctx: InputContext, - data: object) -> MultiModalInputs: + data: object) -> MultiModalKwargs: """ - This function is used to create the MultiModalInputs for the Phi4MM + This function is used to create the MultiModalKwargs for the Phi4MM (audio) model. Specifically, for audio, we extract the audio features from the sound file and create pairs of audio features and audio embed lengths (the @@ -1338,13 +1338,13 @@ def input_mapper_for_phi4mm_audio(ctx: InputContext, data (object): Audio data. Returns: - MultiModalInputs: Multi-modal inputs. 
+ MultiModalKwargs: Multi-modal inputs. """ if not isinstance(data, list): data = [data] if len(data) == 0: - return MultiModalInputs() + return MultiModalKwargs() audio_features = [] for audio_input in data: @@ -1365,7 +1365,7 @@ def input_mapper_for_phi4mm_audio(ctx: InputContext, [single_audio_embed_size], ) audio_features.append(single_audio_feature_audio_len_pair) - return MultiModalInputs({"audio_features": audio_features}) + return MultiModalKwargs({"audio_features": audio_features}) def input_mapper_for_phi4mm_image(ctx: InputContext, data: object): @@ -1373,7 +1373,7 @@ def input_mapper_for_phi4mm_image(ctx: InputContext, data: object): data = [data] # data: list of PIL images if len(data) == 0: - return MultiModalInputs() + return MultiModalKwargs() hf_config = ctx.get_hf_config() vision_encoder_name = hf_config.img_processor if vision_encoder_name is None: @@ -1385,7 +1385,7 @@ def input_mapper_for_phi4mm_image(ctx: InputContext, data: object): image_input_dict = preprocess(data, dynamic_hd_size, vit_image_size, vit_patch_size) - return MultiModalInputs({ + return MultiModalKwargs({ "pixel_values": image_input_dict["pixel_values"], "image_sizes": diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index 3f5faea4f875c..a69c0fc54e4c2 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -105,6 +105,7 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor): prompt=prompt, prompt_token_ids=[1], mm_kwargs=MultiModalKwargs(mm_kwargs), + mm_hashes=None, mm_placeholders={}, ) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 3a588bb4eaba1..81d72ff190222 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -743,7 +743,7 @@ class MultiModalInputs(TypedDict): mm_kwargs: MultiModalKwargs """Keyword arguments to be directly passed to the model after batching.""" - 
mm_hashes: NotRequired[Optional["MultiModalHashDict"]] + mm_hashes: Optional["MultiModalHashDict"] """The hashes of the multi-modal data.""" mm_placeholders: MultiModalPlaceholderDict From 355f66348c3ddb0a2c3217372f2ee47fb961d58f Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 28 Mar 2025 14:34:34 +0800 Subject: [PATCH 065/593] [V1] Remove legacy input registry (#15673) Signed-off-by: DarkLight1337 --- .../multimodal/processing/test_h2ovl.py | 7 +-- .../multimodal/processing/test_idefics3.py | 7 +-- .../multimodal/processing/test_internvl.py | 7 +-- .../multimodal/processing/test_llava_next.py | 16 +---- .../processing/test_llava_onevision.py | 16 +---- .../multimodal/processing/test_phi3v.py | 7 +-- .../multimodal/processing/test_qwen2_vl.py | 8 +-- tests/multimodal/test_processing.py | 18 ++---- vllm/inputs/preprocess.py | 12 ++-- vllm/inputs/registry.py | 25 +++++--- vllm/multimodal/profiling.py | 55 +++++++--------- vllm/multimodal/registry.py | 63 ++++++++++++++++--- vllm/v1/engine/async_llm.py | 7 ++- vllm/v1/engine/llm_engine.py | 4 +- vllm/v1/engine/processor.py | 22 +++---- vllm/v1/worker/gpu_model_runner.py | 9 +-- vllm/v1/worker/tpu_model_runner.py | 2 - 17 files changed, 132 insertions(+), 153 deletions(-) diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py index 713fc733e21c6..709a686577f34 100644 --- a/tests/models/multimodal/processing/test_h2ovl.py +++ b/tests/models/multimodal/processing/test_h2ovl.py @@ -10,7 +10,6 @@ from transformers import PretrainedConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import rescale_image_size from vllm.multimodal.processing import BaseMultiModalProcessor -from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from ....conftest import _ImageAssets from ...utils import build_model_context @@ -156,11 +155,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init 
else None, limit_mm_per_prompt={"image": len(size_factors)}, ) - tokenizer = cached_tokenizer_from_config(ctx.model_config) - processor = MULTIMODAL_REGISTRY.create_processor( - ctx.model_config, - tokenizer=tokenizer, - ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs min_num = min_dynamic_patch if dynamic_image_size else 1 diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py index 4cff429a53941..f5b5cf6b5ba96 100644 --- a/tests/models/multimodal/processing/test_idefics3.py +++ b/tests/models/multimodal/processing/test_idefics3.py @@ -4,7 +4,6 @@ import pytest from transformers import Idefics3Config from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from ....conftest import _ImageAssets from ...utils import build_model_context @@ -38,11 +37,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = cached_tokenizer_from_config(ctx.model_config) - processor = MULTIMODAL_REGISTRY.create_processor( - ctx.model_config, - tokenizer=tokenizer, - ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py index f5bd661071ac6..5ac47ecc5cc17 100644 --- a/tests/models/multimodal/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -10,7 +10,6 @@ from transformers import PretrainedConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import rescale_image_size from vllm.multimodal.processing import BaseMultiModalProcessor 
-from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from ....conftest import _ImageAssets from ...utils import build_model_context @@ -113,11 +112,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": len(size_factors)}, ) - tokenizer = cached_tokenizer_from_config(ctx.model_config) - processor = MULTIMODAL_REGISTRY.create_processor( - ctx.model_config, - tokenizer=tokenizer, - ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs min_num = min_dynamic_patch if dynamic_image_size else 1 diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py index 74bca0e358996..fe56a200a330f 100644 --- a/tests/models/multimodal/processing/test_llava_next.py +++ b/tests/models/multimodal/processing/test_llava_next.py @@ -10,7 +10,6 @@ from pqdm.threads import pqdm from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.parse import ImageSize from vllm.multimodal.processing import BaseMultiModalProcessor -from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from ...utils import build_model_context @@ -40,10 +39,7 @@ def test_processor_max_tokens(model_id): mm_processor_kwargs=None, limit_mm_per_prompt={"image": 1}, ) - processor = MULTIMODAL_REGISTRY.create_processor( - ctx.model_config, - tokenizer=cached_tokenizer_from_config(ctx.model_config), - ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) info = processor.info seen_aspect_ratios = set[float]() @@ -139,10 +135,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor( - ctx.model_config, - tokenizer=cached_tokenizer_from_config(ctx.model_config), - ) + processor = 
MULTIMODAL_REGISTRY.create_processor(ctx.model_config) image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), (488, 183), (2560, 1669)] @@ -168,10 +161,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor( - ctx.model_config, - tokenizer=cached_tokenizer_from_config(ctx.model_config), - ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) seen_aspect_ratios = set[float]() image_sizes = list[ImageSize]() diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py index c27898a40b711..7cefdd37ee49a 100644 --- a/tests/models/multimodal/processing/test_llava_onevision.py +++ b/tests/models/multimodal/processing/test_llava_onevision.py @@ -10,7 +10,6 @@ from pqdm.threads import pqdm from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.parse import ImageSize from vllm.multimodal.processing import BaseMultiModalProcessor -from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from ...utils import build_model_context @@ -41,10 +40,7 @@ def test_processor_max_tokens(model_id): mm_processor_kwargs=None, limit_mm_per_prompt={"image": 1}, ) - processor = MULTIMODAL_REGISTRY.create_processor( - ctx.model_config, - tokenizer=cached_tokenizer_from_config(ctx.model_config), - ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) info = processor.info seen_aspect_ratios = set[float]() @@ -139,10 +135,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor( - ctx.model_config, - tokenizer=cached_tokenizer_from_config(ctx.model_config), - ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) image_ratios = [(171, 152), (184, 161), 
(198, 176), (333, 296), (369, 328), (488, 183), (2560, 1669)] @@ -169,10 +162,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor( - ctx.model_config, - tokenizer=cached_tokenizer_from_config(ctx.model_config), - ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) seen_aspect_ratios = set[float]() image_sizes = list[ImageSize]() diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py index dd5f30a23176b..ed0d04c5c5f5d 100644 --- a/tests/models/multimodal/processing/test_phi3v.py +++ b/tests/models/multimodal/processing/test_phi3v.py @@ -3,7 +3,6 @@ import pytest from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from ....conftest import _ImageAssets from ...utils import build_model_context @@ -39,11 +38,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = cached_tokenizer_from_config(ctx.model_config) - processor = MULTIMODAL_REGISTRY.create_processor( - ctx.model_config, - tokenizer=tokenizer, - ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py index 95204c7ebb4d8..d8c2ca414d41c 100644 --- a/tests/models/multimodal/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -3,7 +3,6 @@ import pytest from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from ....conftest import _ImageAssets from ...utils 
import build_model_context @@ -34,11 +33,8 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = cached_tokenizer_from_config(ctx.model_config) - processor = MULTIMODAL_REGISTRY.create_processor( - ctx.model_config, - tokenizer=tokenizer, - ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + tokenizer = processor.info.get_tokenizer() hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index b229f1e6ec8da..da112bd7a921c 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -28,8 +28,7 @@ from vllm.multimodal.processing import (PlaceholderFeaturesInfo, replace_token_matches) # yapf: enable from vllm.multimodal.profiling import MultiModalProfiler -from vllm.transformers_utils.tokenizer import (AnyTokenizer, - cached_tokenizer_from_config) +from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import full_groupby from .utils import random_image @@ -955,10 +954,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): limit_mm_per_prompt=limit_mm_per_prompt, ) - processor = MULTIMODAL_REGISTRY.create_processor( - model_config, - tokenizer=cached_tokenizer_from_config(model_config), - ) + processor = MULTIMODAL_REGISTRY.create_processor(model_config) profiler = MultiModalProfiler(processor) mock_supported_mm_limits = MagicMock(return_value={"image": num_supported}) @@ -994,10 +990,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): limit_mm_per_prompt=limit_mm_per_prompt, ) - processor = MULTIMODAL_REGISTRY.create_processor( - model_config, - tokenizer=cached_tokenizer_from_config(model_config), - ) + processor = MULTIMODAL_REGISTRY.create_processor(model_config) 
rng = np.random.RandomState(0) image = random_image(rng, min_wh=128, max_wh=256) @@ -1066,10 +1059,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs): revision=None, ) - processor = MULTIMODAL_REGISTRY.create_processor( - model_config, - tokenizer=cached_tokenizer_from_config(model_config), - ) + processor = MULTIMODAL_REGISTRY.create_processor(model_config) orig_get_hf_processor = processor.info.get_hf_processor def get_hf_processor(self, **kwargs): diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 5cda5e5e3dee4..669fb96e6653a 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -261,13 +261,13 @@ class InputPreprocessor: # initialized without a tokenizer while using also multi-modal # input. if not self.tokenizer: - tokenizer = None + tokenizer = object() # Dummy else: tokenizer_group = self.get_tokenizer_group() tokenizer = tokenizer_group.get_lora_tokenizer(lora_request) - mm_processor = self.mm_registry.create_processor( - self.model_config, tokenizer) + mm_processor = self.mm_registry.create_processor(self.model_config, + tokenizer=tokenizer) if mm_processor_kwargs is None: mm_processor_kwargs = {} @@ -288,14 +288,14 @@ class InputPreprocessor: # initialized without a tokenizer while using also multi-modal # input. 
if not self.tokenizer: - tokenizer = None + tokenizer = object() # Dummy else: tokenizer_group = self.get_tokenizer_group() tokenizer = await tokenizer_group.get_lora_tokenizer_async( lora_request) - mm_processor = self.mm_registry.create_processor( - self.model_config, tokenizer) + mm_processor = self.mm_registry.create_processor(self.model_config, + tokenizer=tokenizer) if mm_processor_kwargs is None: mm_processor_kwargs = {} diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 8b95db7a72522..0579893e5d767 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -13,8 +13,7 @@ from typing_extensions import TypeVar, assert_never from vllm.logger import init_logger from vllm.transformers_utils.processor import cached_processor_from_config -from vllm.transformers_utils.tokenizer import (AnyTokenizer, - cached_tokenizer_from_config) +from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides, resolve_mm_processor_kwargs) @@ -329,17 +328,27 @@ class InputRegistry: from vllm.model_executor.model_loader import get_model_architecture from vllm.multimodal import MultiModalKwargs from vllm.multimodal.profiling import MultiModalProfiler + from vllm.sequence import SequenceData if mm_registry.has_processor(model_config): - tokenizer = cached_tokenizer_from_config(model_config) processor = mm_registry.create_processor(model_config, - tokenizer, disable_cache=True) profiler = MultiModalProfiler(processor) - dummy_data_factory = (profiler.get_encoder_dummy_data - if is_encoder_data else - profiler.get_decoder_dummy_data) - dummy_data = dummy_data_factory(seq_len) + + dummy_data_v1 = (profiler.get_encoder_dummy_data(seq_len) + if is_encoder_data else + profiler.get_decoder_dummy_data(seq_len)) + _seq_data = SequenceData.from_seqs( + dummy_data_v1.prompt_token_ids) # type: ignore[attr-defined] + + dummy_data = DummyData( + seq_data=_seq_data, + 
multi_modal_data=getattr(dummy_data_v1, "multi_modal_data", + None), + multi_modal_placeholders=getattr(dummy_data_v1, + "multi_modal_placeholders", + None), + ) else: model_cls, _ = get_model_architecture(model_config) if is_encoder_data: diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 7b4fb5eb598d1..e36f8e4434ec6 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -3,18 +3,18 @@ from abc import ABC, abstractmethod from collections.abc import Mapping from dataclasses import dataclass, field -from typing import Generic, TypeVar, cast +from typing import Generic, NamedTuple, TypeVar, cast import numpy as np import numpy.typing as npt from PIL import Image import vllm.envs as envs -from vllm.inputs import DummyData from vllm.logger import init_logger from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, - MultiModalInputs) + MultiModalInputs, MultiModalKwargs, + MultiModalPlaceholderDict) from .processing import BaseMultiModalProcessor, BaseProcessingInfo logger = init_logger(__name__) @@ -31,6 +31,20 @@ class ProcessorInputs: hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict) +class DummyEncoderData(NamedTuple): + """Dummy data used for profiling.""" + + prompt_token_ids: list[int] + + +class DummyDecoderData(NamedTuple): + """Dummy data used for profiling.""" + + prompt_token_ids: list[int] + multi_modal_data: MultiModalKwargs + multi_modal_placeholders: MultiModalPlaceholderDict + + _I = TypeVar("_I", bound=BaseProcessingInfo) @@ -179,13 +193,7 @@ class MultiModalProfiler(Generic[_I]): "tokens.") return mm_inputs, total_placeholders_by_modality - def get_encoder_dummy_data( - self, - seq_len: int, - ) -> DummyData: - # Avoid circular import - from vllm.sequence import SequenceData - + def get_encoder_dummy_data(self, seq_len: int) -> DummyEncoderData: mm_inputs, _ = self.get_and_validate_mm_inputs(seq_len) mm_inputs = cast(MultiModalEncDecInputs, mm_inputs) @@ -197,19 
+205,9 @@ class MultiModalProfiler(Generic[_I]): num_tokens_to_pad = max(total_len, seq_len) - total_len encoder_prompt_token_ids.extend([0] * num_tokens_to_pad) - return DummyData( - seq_data=SequenceData.from_seqs(encoder_prompt_token_ids), - multi_modal_data=None, - multi_modal_placeholders=None, - ) - - def get_decoder_dummy_data( - self, - seq_len: int, - ) -> DummyData: - # Avoid circular import - from vllm.sequence import SequenceData + return DummyEncoderData(encoder_prompt_token_ids) + def get_decoder_dummy_data(self, seq_len: int) -> DummyDecoderData: (mm_inputs, total_placeholders_by_modality ) = self.get_and_validate_mm_inputs(seq_len) @@ -231,16 +229,11 @@ class MultiModalProfiler(Generic[_I]): "and/or reduce `mm_counts`.", seq_len, total_len, total_placeholders_by_modality) - return DummyData( - seq_data=SequenceData.from_prompt_token_counts((0, seq_len)), - multi_modal_data=None, - multi_modal_placeholders=None, - ) + if total_len < seq_len: + prompt_token_ids.extend([0] * (seq_len - total_len)) - prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids))) - - return DummyData( - seq_data=SequenceData.from_seqs(prompt_token_ids), + return DummyDecoderData( + prompt_token_ids=prompt_token_ids, multi_modal_data=mm_inputs["mm_kwargs"], multi_modal_placeholders=mm_inputs["mm_placeholders"], ) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 24b8358982797..8c16c3ba80750 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -21,7 +21,8 @@ from .image import ImagePlugin from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors from .processing import (BaseMultiModalProcessor, BaseProcessingInfo, ProcessingCache) -from .profiling import BaseDummyInputsBuilder, MultiModalProfiler +from .profiling import (BaseDummyInputsBuilder, DummyDecoderData, + DummyEncoderData, MultiModalProfiler) from .video import VideoPlugin if TYPE_CHECKING: @@ -256,10 +257,7 @@ class MultiModalRegistry: on 
underlying model configuration. """ if self.has_processor(model_config): - tokenizer = cached_tokenizer_from_config(model_config) - processor = self.create_processor(model_config, - tokenizer, - disable_cache=True) + processor = self.create_processor(model_config, disable_cache=True) seq_len = model_config.max_model_len mm_limits = self.get_mm_limits_per_prompt(model_config) return processor.info.get_mm_max_tokens_per_item( @@ -373,10 +371,7 @@ class MultiModalRegistry: This should be called after :meth:`init_mm_limits_per_prompt`. """ if self.has_processor(model_config): - tokenizer = cached_tokenizer_from_config(model_config) - processor = self.create_processor(model_config, - tokenizer, - disable_cache=True) + processor = self.create_processor(model_config, disable_cache=True) profiler = MultiModalProfiler(processor) return profiler.get_mm_limits() @@ -436,8 +431,8 @@ class MultiModalRegistry: def create_processor( self, model_config: "ModelConfig", - tokenizer: AnyTokenizer, *, + tokenizer: Optional[AnyTokenizer] = None, disable_cache: Optional[bool] = None, ) -> BaseMultiModalProcessor[BaseProcessingInfo]: """ @@ -446,6 +441,8 @@ class MultiModalRegistry: See also: :ref:`mm-processing` """ + if tokenizer is None: + tokenizer = cached_tokenizer_from_config(model_config) if disable_cache is None: disable_cache = model_config.disable_mm_preprocessor_cache @@ -456,3 +453,49 @@ class MultiModalRegistry: cache = None if disable_cache else self._processing_cache return factories.build_processor(ctx, cache=cache) + + def get_decoder_dummy_data( + self, + model_config: "ModelConfig", + seq_len: int, + ) -> DummyDecoderData: + """ + Create dummy data for profiling the memory usage of a model. + + The model is identified by ``model_config``. 
+ """ + processor = self.create_processor(model_config, disable_cache=True) + profiler = MultiModalProfiler(processor) + dummy_data = profiler.get_decoder_dummy_data(seq_len) + + # Having more tokens is over-conservative but otherwise fine + token_ids = dummy_data.prompt_token_ids + if len(token_ids) < seq_len: + raise AssertionError( + f"Expected at least {seq_len} dummy tokens for profiling, " + f"but found {len(token_ids)} tokens instead.") + + return dummy_data + + def get_encoder_dummy_data( + self, + model_config: "ModelConfig", + seq_len: int, + ) -> DummyEncoderData: + """ + Create dummy data for profiling the memory usage of a model. + + The model is identified by ``model_config``. + """ + processor = self.create_processor(model_config, disable_cache=True) + profiler = MultiModalProfiler(processor) + dummy_data = profiler.get_encoder_dummy_data(seq_len) + + # Having more tokens is over-conservative but otherwise fine + token_ids = dummy_data.prompt_token_ids + if len(token_ids) < seq_len: + logger.warning_once( + f"Expected at least {seq_len} dummy encoder tokens for " + f"profiling, but found {len(token_ids)} tokens instead.") + + return dummy_data diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 1fb9ae8cb7a59..a8d86e70f6abf 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -14,10 +14,11 @@ from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE -from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType +from vllm.inputs import PromptType from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.outputs import RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request 
import PromptAdapterRequest @@ -48,7 +49,7 @@ class AsyncLLM(EngineClient): executor_class: type[Executor], log_stats: bool, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - input_registry: InputRegistry = INPUT_REGISTRY, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, use_cached_outputs: bool = False, log_requests: bool = True, start_engine_loop: bool = True, @@ -90,7 +91,7 @@ class AsyncLLM(EngineClient): self.processor = Processor( vllm_config=vllm_config, tokenizer=self.tokenizer, - input_registry=input_registry, + mm_registry=mm_registry, ) # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 8cc73f9fe7224..000de21fbe7bf 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -11,7 +11,7 @@ from vllm.config import ParallelConfig, VllmConfig from vllm.distributed import stateless_destroy_torch_distributed_process_group from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics_types import StatLoggerBase -from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType +from vllm.inputs import PromptType from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry @@ -44,7 +44,6 @@ class LLMEngine: log_stats: bool, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: Optional[dict[str, StatLoggerBase]] = None, - input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, use_cached_outputs: bool = False, multiprocess_mode: bool = False, @@ -80,7 +79,6 @@ class LLMEngine: # Processor (convert Inputs --> EngineCoreRequests) self.processor = Processor(vllm_config=vllm_config, tokenizer=self.tokenizer, - input_registry=input_registry, mm_registry=mm_registry) # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). 
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 065ac0920af77..24762d214c345 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -5,8 +5,7 @@ from collections.abc import Mapping from typing import Optional, Union from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, - PromptType, SingletonInputsAdapter) +from vllm.inputs import ProcessorInputs, PromptType from vllm.inputs.parse import split_enc_dec_inputs from vllm.inputs.preprocess import InputPreprocessor from vllm.lora.request import LoRARequest @@ -31,7 +30,6 @@ class Processor: self, vllm_config: VllmConfig, tokenizer: BaseTokenizerGroup, - input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ): @@ -210,7 +208,6 @@ class Processor: self._validate_model_inputs(processed_inputs, lora_request) encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) - decoder_inputs = SingletonInputsAdapter(decoder_inputs) # TODO: Impl encoder-decoder if encoder_inputs is not None: @@ -221,8 +218,9 @@ class Processor: sampling_params = params.clone() # If unset max tokens, then generate up to the max_model_len. 
if sampling_params.max_tokens is None: - sampling_params.max_tokens = (self.model_config.max_model_len - - len(decoder_inputs.prompt_token_ids)) + sampling_params.max_tokens = ( + self.model_config.max_model_len - + len(decoder_inputs["prompt_token_ids"])) sampling_params.update_from_generation_config( self.generation_config_fields, eos_token_id) sampling_params.update_from_tokenizer( @@ -232,8 +230,8 @@ class Processor: sorted_mm_inputs: Optional[list[MultiModalKwargs]] = None sorted_mm_positions: Optional[list[PlaceholderRange]] = None sorted_mm_hashes: Optional[list[str]] = None - if (decoder_mm_inputs := decoder_inputs.multi_modal_data): - assert isinstance(decoder_mm_inputs, MultiModalKwargs) + if decoder_inputs["type"] == "multimodal": + decoder_mm_inputs = decoder_inputs["mm_kwargs"] # The output of merged multi-modal processor (`decoder_mm_inputs`) # contains the kwargs for all items from all modalities. @@ -254,8 +252,8 @@ class Processor: sorted_mm_positions, sorted_mm_hashes, ) = merge_and_sort_multimodal_metadata( - decoder_inputs.multi_modal_placeholders, - decoder_inputs.multi_modal_hashes if self.use_hash else None, + decoder_inputs["mm_placeholders"], + decoder_inputs["mm_hashes"] if self.use_hash else None, ) # NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple @@ -281,8 +279,8 @@ class Processor: return EngineCoreRequest( request_id=request_id, - prompt=decoder_inputs.prompt, - prompt_token_ids=decoder_inputs.prompt_token_ids, + prompt=decoder_inputs.get("prompt"), + prompt_token_ids=decoder_inputs["prompt_token_ids"], mm_inputs=sorted_mm_inputs, mm_hashes=sorted_mm_hashes, mm_placeholders=sorted_mm_positions, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 133ccf84832c4..1b581c69a728b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -15,7 +15,6 @@ from vllm.attention.layer import Attention from vllm.config import CompilationLevel, VllmConfig from 
vllm.distributed.parallel_state import get_pp_group, graph_capture from vllm.forward_context import set_forward_context -from vllm.inputs import INPUT_REGISTRY from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding @@ -130,7 +129,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): self.cascade_attn_enabled = not self.model_config.disable_cascade_attn # Multi-modal data support - self.input_registry = INPUT_REGISTRY self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope @@ -1473,16 +1471,11 @@ class GPUModelRunner(LoRAModelRunnerMixin): encoder_budget, max_num_mm_items, dummy_data_modality) # Create dummy batch of multimodal inputs. - dummy_request_data = self.input_registry.dummy_data_for_profiling( + dummy_request_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, - mm_registry=self.mm_registry, ) dummy_mm_data = dummy_request_data.multi_modal_data - if not isinstance(dummy_mm_data, MultiModalKwargs): - # TODO: Delete this check once input mapper is fully removed. 
- raise RuntimeError( - "Legacy input mapper is not supported in V1") # Dummy data definition may contain multiple multimodal items # (e.g, multiple images) for a single request, therefore here we diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 97dfd23163dff..5401fff2bf19b 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -17,7 +17,6 @@ from vllm.attention.backends.abstract import AttentionType from vllm.attention.layer import Attention from vllm.config import VllmConfig from vllm.forward_context import set_forward_context -from vllm.inputs import INPUT_REGISTRY from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs @@ -102,7 +101,6 @@ class TPUModelRunner: self.hidden_size = model_config.get_hidden_size() # Multi-modal data support - self.input_registry = INPUT_REGISTRY self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope # TODO: Support M-RoPE (e.g, Qwen2-VL) From 2d9045fce8a6b440f937a5f313bf8bc5baf3103a Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Fri, 28 Mar 2025 03:01:26 -0400 Subject: [PATCH 066/593] [TPU][CI] Fix TPUModelRunner Test (#15667) Signed-off-by: Robert Shaw Co-authored-by: Robert Shaw --- .buildkite/run-tpu-v1-test.sh | 2 +- tests/v1/tpu/worker/test_tpu_model_runner.py | 18 +----------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/.buildkite/run-tpu-v1-test.sh b/.buildkite/run-tpu-v1-test.sh index 7bd91575e1729..2c356b8fe5274 100755 --- a/.buildkite/run-tpu-v1-test.sh +++ b/.buildkite/run-tpu-v1-test.sh @@ -30,7 +30,7 @@ docker run --privileged --net host --shm-size=16G -it \ && echo TEST_4 \ && python3 /workspace/vllm/examples/offline_inference/tpu.py \ && echo TEST_5 \ - && pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py \ + && 
pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \ && echo TEST_6 \ && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" \ diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index d5f812ed4d543..6b6a91b857f0e 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -7,7 +7,6 @@ from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig from vllm.sampling_params import SamplingParams from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) -from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.worker.tpu_model_runner import (TPUModelRunner, _get_padded_token_len, _get_paddings) @@ -113,12 +112,6 @@ def _is_req_added(model_runner, req_id: str) -> bool: return req_id in model_runner.requests -def _is_sampling_metadata_changed(model_runner, - sampling_metadata_before: SamplingMetadata): - return model_runner.input_batch.sampling_metadata is not ( - sampling_metadata_before) - - def _is_req_state_block_table_match(model_runner, req_id: str) -> bool: req_index = model_runner.input_batch.req_id_to_index[req_id] block_table = model_runner.input_batch.block_table @@ -136,10 +129,8 @@ def test_update_states_new_request(model_runner): # new req scheduler_output = _schedule_new_request(req_id) - metadata_before = model_runner.input_batch.sampling_metadata model_runner._update_states(scheduler_output) - assert _is_sampling_metadata_changed(model_runner, metadata_before) assert _is_req_added(model_runner, req_id) assert _is_req_scheduled(model_runner, req_id) assert _is_req_state_block_table_match(model_runner, req_id) @@ -170,9 +161,7 @@ def test_update_states_request_finished(model_runner): grammar_bitmask=None, ) - metadata_before = model_runner.input_batch.sampling_metadata model_runner._update_states(scheduler_output) - assert _is_sampling_metadata_changed(model_runner, 
metadata_before) assert not _is_req_added(model_runner, req_id) assert not _is_req_scheduled(model_runner, req_id) @@ -229,9 +218,7 @@ def test_update_states_request_resumed(model_runner): grammar_bitmask=None, ) - metadata_before = model_runner.input_batch.sampling_metadata model_runner._update_states(scheduler_output) - assert _is_sampling_metadata_changed(model_runner, metadata_before) assert _is_req_added(model_runner, req_id) assert _is_req_scheduled(model_runner, req_id) assert _is_req_state_block_table_match(model_runner, req_id) @@ -262,9 +249,7 @@ def test_update_states_no_changes(model_runner): grammar_bitmask=None, ) - metadata_before = model_runner.input_batch.sampling_metadata model_runner._update_states(scheduler_output) - assert not _is_sampling_metadata_changed(model_runner, metadata_before) assert _is_req_added(model_runner, req_id) assert _is_req_scheduled(model_runner, req_id) assert _is_req_state_block_table_match(model_runner, req_id) @@ -299,8 +284,7 @@ def test_update_states_request_unscheduled(model_runner): grammar_bitmask=None, ) - metadata_before = model_runner._update_states(scheduler_output) - assert _is_sampling_metadata_changed(model_runner, metadata_before) + model_runner._update_states(scheduler_output) assert _is_req_added(model_runner, req_ids[0]) assert _is_req_scheduled(model_runner, req_ids[0]) From 32b14baf8a1f7195ca09484de3008063569b43c5 Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Fri, 28 Mar 2025 15:23:30 +0800 Subject: [PATCH 067/593] [Refactor][Frontend] Keep all logic about reasoning into one class (#14428) Signed-off-by: Ce Gao --- .../__init__.py | 0 .../test_deepseekr1_reasoning_parser.py | 52 +++++++-- .../test_granite_reasoning_parser.py | 6 +- .../reasoning_parsers => reasoning}/utils.py | 2 +- vllm/engine/arg_utils.py | 3 +- vllm/engine/llm_engine.py | 5 +- vllm/entrypoints/openai/api_server.py | 2 +- vllm/entrypoints/openai/serving_chat.py | 3 +- .../guided_decoding/__init__.py | 15 ++- 
.../guided_decoding/outlines_decoding.py | 8 +- .../outlines_logits_processors.py | 14 +-- .../reasoner/deepseek_reasoner.py | 38 ------- .../guided_decoding/reasoner/reasoner.py | 23 ---- .../guided_decoding/xgrammar_decoding.py | 6 +- .../__init__.py | 0 .../abs_reasoning_parsers.py | 101 ++++++++---------- .../deepseek_r1_reasoning_parser.py | 90 ++++++++-------- .../granite_reasoning_parser.py | 3 +- 18 files changed, 171 insertions(+), 200 deletions(-) rename tests/{entrypoints/openai/reasoning_parsers => reasoning}/__init__.py (100%) rename tests/{entrypoints/openai/reasoning_parsers => reasoning}/test_deepseekr1_reasoning_parser.py (75%) rename tests/{entrypoints/openai/reasoning_parsers => reasoning}/test_granite_reasoning_parser.py (97%) rename tests/{entrypoints/openai/reasoning_parsers => reasoning}/utils.py (97%) delete mode 100644 vllm/model_executor/guided_decoding/reasoner/deepseek_reasoner.py delete mode 100644 vllm/model_executor/guided_decoding/reasoner/reasoner.py rename vllm/{entrypoints/openai/reasoning_parsers => reasoning}/__init__.py (100%) rename vllm/{entrypoints/openai/reasoning_parsers => reasoning}/abs_reasoning_parsers.py (82%) rename vllm/{entrypoints/openai/reasoning_parsers => reasoning}/deepseek_r1_reasoning_parser.py (64%) rename vllm/{entrypoints/openai/reasoning_parsers => reasoning}/granite_reasoning_parser.py (99%) diff --git a/tests/entrypoints/openai/reasoning_parsers/__init__.py b/tests/reasoning/__init__.py similarity index 100% rename from tests/entrypoints/openai/reasoning_parsers/__init__.py rename to tests/reasoning/__init__.py diff --git a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py b/tests/reasoning/test_deepseekr1_reasoning_parser.py similarity index 75% rename from tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py rename to tests/reasoning/test_deepseekr1_reasoning_parser.py index 5ce5d9280f3ef..7b6af183a86ad 100644 --- 
a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py +++ b/tests/reasoning/test_deepseekr1_reasoning_parser.py @@ -3,74 +3,92 @@ import pytest from transformers import AutoTokenizer -from tests.entrypoints.openai.reasoning_parsers.utils import ( - run_reasoning_extraction) -from vllm.entrypoints.openai.reasoning_parsers import (ReasoningParser, - ReasoningParserManager) +from tests.reasoning.utils import run_reasoning_extraction +from vllm.reasoning import ReasoningParser, ReasoningParserManager parser_name = "deepseek_r1" start_token = "" end_token = "" +REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" + + +@pytest.fixture(scope="module") +def deepseek_r1_qwen_tokenizer(): + return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) + + SIMPLE_REASONING = { "output": "This is a reasoning sectionThis is the rest", "reasoning_content": "This is a reasoning section", "content": "This is the rest", + "is_reasoning_end": True, } COMPLETE_REASONING = { "output": "This is a reasoning section", "reasoning_content": "This is a reasoning section", "content": None, + "is_reasoning_end": True, } NO_CONTENT = { "output": "This is content", "reasoning_content": "This is content", "content": None, + "is_reasoning_end": False, } NO_REASONING_STREAMING = { "output": "This is a reasoning section", "reasoning_content": "This is a reasoning section", "content": None, + "is_reasoning_end": False, } MULTIPLE_LINES = { "output": "This\nThatThis is the rest\nThat", "reasoning_content": "This\nThat", "content": "This is the rest\nThat", + "is_reasoning_end": True, } SHORTEST_REASONING_NO_STREAMING = { "output": "This is the rest", "reasoning_content": "", "content": "This is the rest", + "is_reasoning_end": True, } SHORTEST_REASONING = { "output": "This is the rest", "reasoning_content": None, "content": "This is the rest", + "is_reasoning_end": True, } REASONING_WITH_THINK = { "output": "This is a reasoning sectionThis is the rest", 
"reasoning_content": "This is a reasoning section", "content": "This is the rest", + "is_reasoning_end": True, } COMPLETE_REASONING_WITH_THINK = { "output": "This is a reasoning section", "reasoning_content": "This is a reasoning section", "content": None, + "is_reasoning_end": True, } MULTIPLE_LINES_WITH_THINK = { "output": "This\nThatThis is the rest\nThat", "reasoning_content": "This\nThat", "content": "This is the rest\nThat", + "is_reasoning_end": True, } SHORTEST_REASONING_NO_STREAMING_WITH_THINK = { "output": "This is the rest", "reasoning_content": "", "content": "This is the rest", + "is_reasoning_end": True, } SHORTEST_REASONING_WITH_THINK = { "output": "This is the rest", "reasoning_content": None, "content": "This is the rest", + "is_reasoning_end": True, } TEST_CASES = [ @@ -166,23 +184,21 @@ TEST_CASES = [ ), ] -# Global tokenizer initialization to avoid repeated loading -tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m") -tokenizer.add_tokens([start_token, end_token]) - @pytest.mark.parametrize("streaming, param_dict", TEST_CASES) def test_reasoning( streaming: bool, param_dict: dict, + deepseek_r1_qwen_tokenizer, ): - output = tokenizer.tokenize(param_dict["output"]) + output = deepseek_r1_qwen_tokenizer.tokenize(param_dict["output"]) # decode everything to tokens output_tokens: list[str] = [ - tokenizer.convert_tokens_to_string([token]) for token in output + deepseek_r1_qwen_tokenizer.convert_tokens_to_string([token]) + for token in output ] parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser( - parser_name)(tokenizer) + parser_name)(deepseek_r1_qwen_tokenizer) reasoning, content = run_reasoning_extraction(parser, output_tokens, @@ -190,3 +206,17 @@ def test_reasoning( assert reasoning == param_dict["reasoning_content"] assert content == param_dict["content"] + + # Test is_reasoning_end + output_ids = deepseek_r1_qwen_tokenizer.convert_tokens_to_ids(output) + is_reasoning_end = parser.is_reasoning_end(output_ids) + 
assert is_reasoning_end == param_dict["is_reasoning_end"] + + # Test extract_content + if param_dict["content"] is not None: + content = parser.extract_content_ids(output_ids) + assert content == deepseek_r1_qwen_tokenizer.convert_tokens_to_ids( + deepseek_r1_qwen_tokenizer.tokenize(param_dict["content"])) + else: + content = parser.extract_content_ids(output) + assert content == [] diff --git a/tests/entrypoints/openai/reasoning_parsers/test_granite_reasoning_parser.py b/tests/reasoning/test_granite_reasoning_parser.py similarity index 97% rename from tests/entrypoints/openai/reasoning_parsers/test_granite_reasoning_parser.py rename to tests/reasoning/test_granite_reasoning_parser.py index 84ac6600498b2..48fb8c2f8d1b9 100644 --- a/tests/entrypoints/openai/reasoning_parsers/test_granite_reasoning_parser.py +++ b/tests/reasoning/test_granite_reasoning_parser.py @@ -2,10 +2,8 @@ import pytest from transformers import AutoTokenizer -from tests.entrypoints.openai.reasoning_parsers.utils import ( - DeltaMessage, run_reasoning_extraction) -from vllm.entrypoints.openai.reasoning_parsers import (ReasoningParser, - ReasoningParserManager) +from tests.reasoning.utils import DeltaMessage, run_reasoning_extraction +from vllm.reasoning import ReasoningParser, ReasoningParserManager parser_name = "granite" START_REASONING = "Here is my thought process:" diff --git a/tests/entrypoints/openai/reasoning_parsers/utils.py b/tests/reasoning/utils.py similarity index 97% rename from tests/entrypoints/openai/reasoning_parsers/utils.py rename to tests/reasoning/utils.py index 01e43130bc6e7..0f894ed800c6c 100644 --- a/tests/entrypoints/openai/reasoning_parsers/utils.py +++ b/tests/reasoning/utils.py @@ -4,7 +4,7 @@ from typing import Optional, Union from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage) -from vllm.entrypoints.openai.reasoning_parsers import ReasoningParser +from vllm.reasoning import ReasoningParser class StreamingReasoningReconstructor: diff 
--git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d049f773caccd..a416fa8aa08e3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -23,6 +23,7 @@ from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.plugins import load_general_plugins +from vllm.reasoning import ReasoningParserManager from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 from vllm.transformers_utils.utils import check_gguf_file from vllm.usage.usage_lib import UsageContext @@ -1119,7 +1120,7 @@ class EngineArgs: parser.add_argument( "--reasoning-parser", type=str, - choices=["deepseek_r1", "granite"], + choices=list(ReasoningParserManager.reasoning_parsers), default=None, help= "Select the reasoning parser depending on the model that you're " diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 4856c3568319b..5682b3dabe2e8 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -2080,8 +2080,9 @@ class LLMEngine: guided_decoding.backend = guided_decoding.backend or \ self.decoding_config.guided_decoding_backend - logger.debug("Reasoning backend: %s", - self.decoding_config.reasoning_backend) + if self.decoding_config.reasoning_backend is not None: + logger.debug("Building with reasoning backend %s", + self.decoding_config.reasoning_backend) processor = get_local_guided_decoding_logits_processor( guided_params=guided_decoding, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 1e735da641df9..6c1f60fa6a3b4 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -68,7 +68,6 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, TranscriptionRequest, TranscriptionResponse, UnloadLoRAAdapterRequest) -from vllm.entrypoints.openai.reasoning_parsers import ReasoningParserManager # yapf: enable from 
vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion @@ -85,6 +84,7 @@ from vllm.entrypoints.openai.serving_transcription import ( from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.entrypoints.utils import load_aware_call, with_cancellation from vllm.logger import init_logger +from vllm.reasoning import ReasoningParserManager from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.transformers_utils.tokenizer import MistralTokenizer diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 3102db4050f5b..eda4722836bdb 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -23,8 +23,6 @@ from vllm.entrypoints.openai.protocol import ( ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ErrorResponse, FunctionCall, PromptTokenUsageInfo, RequestResponseMetadata, ToolCall, UsageInfo) -from vllm.entrypoints.openai.reasoning_parsers import (ReasoningParser, - ReasoningParserManager) from vllm.entrypoints.openai.serving_engine import (OpenAIServing, clamp_prompt_logprobs) from vllm.entrypoints.openai.serving_models import OpenAIServingModels @@ -33,6 +31,7 @@ from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( MistralToolCall) from vllm.logger import init_logger from vllm.outputs import CompletionOutput, RequestOutput +from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sequence import Logprob from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index 0c26a60588c88..cecb3a8a1d4a8 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ 
b/vllm/model_executor/guided_decoding/__init__.py @@ -5,10 +5,10 @@ from __future__ import annotations from typing import TYPE_CHECKING from vllm.logger import init_logger -from vllm.model_executor.guided_decoding.reasoner import get_reasoner from vllm.model_executor.guided_decoding.utils import ( convert_lark_to_gbnf, grammar_is_likely_lark, has_lmf_unsupported_json_features, has_xgrammar_unsupported_json_features) +from vllm.reasoning import ReasoningParserManager if TYPE_CHECKING: from transformers import PreTrainedTokenizer @@ -107,7 +107,11 @@ async def get_guided_decoding_logits_processor( model_config: ModelConfig, reasoning_backend: str | None = None) -> LogitsProcessor | None: - reasoner = get_reasoner(tokenizer, reasoning_backend) + reasoner = None + if reasoning_backend is not None: + reasoner_class = ReasoningParserManager.get_reasoning_parser( + reasoning_backend) + reasoner = reasoner_class(tokenizer) guided_params = maybe_backend_fallback(guided_params) @@ -146,8 +150,11 @@ def get_local_guided_decoding_logits_processor( reasoning_backend: str | None = None) -> LogitsProcessor | None: guided_params = maybe_backend_fallback(guided_params) - # Get the reasoner if needed, it will be None if reasoning_ - reasoner = get_reasoner(tokenizer, reasoning_backend) + reasoner = None + if reasoning_backend is not None: + reasoner_class = ReasoningParserManager.get_reasoning_parser( + reasoning_backend) + reasoner = reasoner_class(tokenizer) # CFG grammar not supported by LMFE, so we use outlines instead if guided_params.backend_name == 'outlines': diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py index 97f63ae11f457..564f9277a83c6 100644 --- a/vllm/model_executor/guided_decoding/outlines_decoding.py +++ b/vllm/model_executor/guided_decoding/outlines_decoding.py @@ -12,7 +12,7 @@ from transformers import PreTrainedTokenizerBase from 
vllm.model_executor.guided_decoding.outlines_logits_processors import ( CFGLogitsProcessor, JSONLogitsProcessor, RegexLogitsProcessor) -from vllm.model_executor.guided_decoding.reasoner import Reasoner +from vllm.reasoning import ReasoningParser from vllm.sampling_params import GuidedDecodingParams @@ -61,7 +61,7 @@ _MAX_THREADPOOL_WORKERS = 16 async def get_outlines_guided_decoding_logits_processor( guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizerBase, - reasoner: Optional[Reasoner], + reasoner: Optional[ReasoningParser], ) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor, None]: """ @@ -92,7 +92,7 @@ async def get_outlines_guided_decoding_logits_processor( def get_local_outlines_guided_decoding_logits_processor( guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizerBase, - reasoner: Optional[Reasoner], + reasoner: Optional[ReasoningParser], ) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor, None]: """ @@ -141,7 +141,7 @@ def _get_logits_processor( tokenizer: PreTrainedTokenizerBase, mode: GuidedDecodingMode, whitespace_pattern: Union[str, None], - reasoner: Optional[Reasoner], + reasoner: Optional[ReasoningParser], ) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor]: if mode == GuidedDecodingMode.JSON: return JSONLogitsProcessor(guide, tokenizer, whitespace_pattern, diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index 8b2a0f4cfe64b..31af4593f1123 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -34,8 +34,8 @@ from transformers import PreTrainedTokenizerBase import vllm.envs as envs from vllm.logger import init_logger -from vllm.model_executor.guided_decoding.reasoner import Reasoner from vllm.platforms import current_platform +from vllm.reasoning import 
ReasoningParser logger = init_logger(__name__) @@ -49,9 +49,9 @@ else: class BaseLogitsProcessor: - def __init__(self, guide: Guide, reasoner: Optional[Reasoner]): + def __init__(self, guide: Guide, reasoner: Optional[ReasoningParser]): self._guide: Guide = guide - self._reasoner: Optional[Reasoner] = reasoner + self._reasoner: Optional[ReasoningParser] = reasoner # CFGState is used for the FSM state for CFGGuide self._fsm_state: DefaultDict[int, Union[int, CFGState]] = defaultdict(int) @@ -69,7 +69,7 @@ class BaseLogitsProcessor: # Remove the reasoning tokens from the input_ids # We need this because our implementation relies on the # hash of the input_ids to store the FSM state. - input_ids = self._reasoner.extract_content(input_ids) + input_ids = self._reasoner.extract_content_ids(input_ids) seq_id = hash(tuple(input_ids)) @@ -142,7 +142,7 @@ class RegexLogitsProcessor(BaseLogitsProcessor): self, regex_string: str, tokenizer: PreTrainedTokenizerBase, - reasoner: Optional[Reasoner], + reasoner: Optional[ReasoningParser], ): """Compile the FSM that drives the regex-structured generation. @@ -163,7 +163,7 @@ class JSONLogitsProcessor(RegexLogitsProcessor): def __init__(self, schema: Union[str, Dict, BaseModel], tokenizer: PreTrainedTokenizerBase, whitespace_pattern: Union[str, None], - reasoner: Optional[Reasoner]): + reasoner: Optional[ReasoningParser]): """Compile the FSM that drives the JSON-guided generation. Parameters @@ -203,7 +203,7 @@ class CFGLogitsProcessor(BaseLogitsProcessor): return CFGGuide(cfg, tokenizer) def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase, - reasoner: Optional[Reasoner]): + reasoner: Optional[ReasoningParser]): """Compile the FSM that drives the context free grammar generation. 
Parameters diff --git a/vllm/model_executor/guided_decoding/reasoner/deepseek_reasoner.py b/vllm/model_executor/guided_decoding/reasoner/deepseek_reasoner.py deleted file mode 100644 index 7e61e6a9620c7..0000000000000 --- a/vllm/model_executor/guided_decoding/reasoner/deepseek_reasoner.py +++ /dev/null @@ -1,38 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -from dataclasses import dataclass - -from transformers import PreTrainedTokenizer - -from vllm.model_executor.guided_decoding.reasoner.reasoner import Reasoner - - -@dataclass -class DeepSeekReasoner(Reasoner): - """ - Reasoner for DeepSeek R series models. - """ - start_token_id: int - end_token_id: int - - start_token: str = "" - end_token: str = "" - - @classmethod - def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner: - return cls(start_token_id=tokenizer.encode( - "", add_special_tokens=False)[0], - end_token_id=tokenizer.encode("", - add_special_tokens=False)[0]) - - def is_reasoning_end(self, input_ids: list[int]) -> bool: - return self.end_token_id in input_ids - - def extract_content(self, input_ids: list[int]) -> list[int]: - """ - Extract the content after the end tokens - """ - if self.end_token_id not in input_ids or \ - input_ids.index(self.end_token_id) + 1 == len(input_ids): - return [] - else: - return input_ids[input_ids.index(self.end_token_id) + 1:] diff --git a/vllm/model_executor/guided_decoding/reasoner/reasoner.py b/vllm/model_executor/guided_decoding/reasoner/reasoner.py deleted file mode 100644 index df21b1db62218..0000000000000 --- a/vllm/model_executor/guided_decoding/reasoner/reasoner.py +++ /dev/null @@ -1,23 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -from __future__ import annotations - -from abc import ABC, abstractmethod -from dataclasses import dataclass - -from transformers import PreTrainedTokenizer - - -@dataclass -class Reasoner(ABC): - - @abstractmethod - def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner: - pass - - @abstractmethod - 
def is_reasoning_end(self, input_ids: list[int]) -> bool: - pass - - @abstractmethod - def extract_content(self, input_ids: list[int]) -> list[int]: - pass diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index bc156223953e0..47b1e7e3f9811 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -27,7 +27,7 @@ if TYPE_CHECKING: from transformers import PreTrainedTokenizer from vllm.config import ModelConfig - from vllm.model_executor.guided_decoding.reasoner import Reasoner + from vllm.reasoning import ReasoningParser from vllm.sampling_params import GuidedDecodingParams logger = init_logger(__name__) @@ -37,7 +37,7 @@ def get_local_xgrammar_guided_decoding_logits_processor( guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizer, model_config: ModelConfig, - reasoner: Reasoner | None, + reasoner: ReasoningParser | None, max_threads: int = 8): config = GrammarConfig.from_guided_params(guided_params=guided_params, model_config=model_config, @@ -280,7 +280,7 @@ class GrammarConfig: class XGrammarLogitsProcessor: """Wrapper class to support pickle protocol""" config: GrammarConfig - reasoner: Reasoner | None = None + reasoner: ReasoningParser | None = None ctx: xgr.CompiledGrammar | None = None tokenizer_info: xgr.TokenizerInfo = None # type: ignore[assignment] diff --git a/vllm/entrypoints/openai/reasoning_parsers/__init__.py b/vllm/reasoning/__init__.py similarity index 100% rename from vllm/entrypoints/openai/reasoning_parsers/__init__.py rename to vllm/reasoning/__init__.py diff --git a/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py similarity index 82% rename from vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py rename to vllm/reasoning/abs_reasoning_parsers.py index c95ff191e4d2e..454167a0dc950 100644 --- 
a/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -17,7 +17,7 @@ logger = init_logger(__name__) class ReasoningParser: """ - Abstract reasoning parser class that should not be used directly. + Abstract reasoning parser class that should not be used directly. Provided and methods should be used in derived classes. It is used to extract reasoning content from the model output. @@ -32,6 +32,36 @@ class ReasoningParser: # whereas all tokenizers have .get_vocab() return self.model_tokenizer.get_vocab() + @abstractmethod + def is_reasoning_end(self, input_ids: list[int]) -> bool: + """ + Check if the reasoning content ends in the input_ids. + + It is used in structured engines like `xgrammar` to check if the + reasoning content ends in the model output. + + Parameters: + input_ids: list[int] + The input_ids of the model output. + + Returns: + bool + True if the reasoning content ends in the input_ids. + """ + + @abstractmethod + def extract_content_ids(self, input_ids: list[int]) -> list[int]: + """ + Extract content token ids from the input_ids. + Parameters: + input_ids: list[int] + The input_ids of the model output. + Returns: + list[int] + The extracted content from the input_ids. + """ + + @abstractmethod def extract_reasoning_content( self, model_output: str, request: ChatCompletionRequest ) -> tuple[Optional[str], Optional[str]]: @@ -53,10 +83,7 @@ class ReasoningParser: A tuple containing the reasoning content and the content. 
""" - raise NotImplementedError( - "AbstractReasoningParser.extract_reasoning_calls " - "has not been implemented!") - + @abstractmethod def extract_reasoning_content_streaming( self, previous_text: str, @@ -73,43 +100,6 @@ class ReasoningParser: the current tokens/diffs, but also the information about what has previously been parsed and extracted (see constructor) """ - raise NotImplementedError( - "AbstractReasoningParser.extract_reasoning_content_streaming " - "has not been implemented!") - - # TODO: need to rebase by PR #14428 - @abstractmethod - def is_reasoning_end(self, input_ids: list[int]) -> bool: - """ - Check if the reasoning content ends in the input_ids. - Parameters: - input_ids: list[int] - The input_ids of the model output. - Returns: - bool - True if the reasoning content ends in the input_ids. - """ - - raise NotImplementedError( - "AbstractReasoningParser.is_reasoning_end has" - "not been implemented!") - - # TODO: need to rebase by PR #14428 - @abstractmethod - def extract_content_ids(self, input_ids: list[int]) -> list[int]: - """ - Extract content token ids from the input_ids. - Parameters: - input_ids: list[int] - The input_ids of the model output. - Returns: - list[int] - The extracted content from the input_ids. 
- """ - - raise NotImplementedError( - "AbstractReasoningParser.extract_content_ids has" - " not been implemented!") class ReasoningParserManager: @@ -125,14 +115,16 @@ class ReasoningParserManager: if name in cls.reasoning_parsers: return cls.reasoning_parsers[name] - raise KeyError(f"reasoning helper: '{name}' not found in " - "reasoning_parsers") + raise KeyError( + f"reasoning helper: '{name}' not found in reasoning_parsers") @classmethod - def _register_module(cls, - module: type, - module_name: Optional[Union[str, list[str]]] = None, - force: bool = True) -> None: + def _register_module( + cls, + module: type, + module_name: Optional[Union[str, list[str]]] = None, + force: bool = True, + ) -> None: if not issubclass(module, ReasoningParser): raise TypeError("module must be subclass of ReasoningParser, " f"but got {type(module)}") @@ -149,13 +141,14 @@ class ReasoningParserManager: @classmethod def register_module( - cls, - name: Optional[Union[str, list[str]]] = None, - force: bool = True, - module: Union[type, None] = None) -> Union[type, Callable]: + cls, + name: Optional[Union[str, list[str]]] = None, + force: bool = True, + module: Union[type, None] = None, + ) -> Union[type, Callable]: """ Register module with the given name or name list. it can be used as a - decoder(with module as None) or normal function(with module as not + decoder(with module as None) or normal function(with module as not None). """ if not isinstance(force, bool): @@ -183,7 +176,7 @@ class ReasoningParserManager: @classmethod def import_reasoning_parser(cls, plugin_path: str) -> None: """ - Import a user-defined reasoning parser by the path + Import a user-defined reasoning parser by the path of the reasoning parser define file. 
""" module_name = os.path.splitext(os.path.basename(plugin_path))[0] diff --git a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py b/vllm/reasoning/deepseek_r1_reasoning_parser.py similarity index 64% rename from vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py rename to vllm/reasoning/deepseek_r1_reasoning_parser.py index 54e960168cf46..73be6d4d1ab13 100644 --- a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py +++ b/vllm/reasoning/deepseek_r1_reasoning_parser.py @@ -8,9 +8,8 @@ from transformers import PreTrainedTokenizerBase from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage) -from vllm.entrypoints.openai.reasoning_parsers.abs_reasoning_parsers import ( - ReasoningParser, ReasoningParserManager) from vllm.logger import init_logger +from vllm.reasoning import ReasoningParser, ReasoningParserManager logger = init_logger(__name__) @@ -20,43 +19,45 @@ class DeepSeekR1ReasoningParser(ReasoningParser): """ Reasoning parser for DeepSeek R1 model. - The DeepSeek R1 model uses ... tokens to denote reasoning + The DeepSeek R1 model uses ... tokens to denote reasoning text. This parser extracts the reasoning content from the model output. 
""" + start_token_id: int + end_token_id: int + + start_token: str = "" + end_token: str = "" + def __init__(self, tokenizer: PreTrainedTokenizerBase): super().__init__(tokenizer) - self.think_start_token = "" - self.think_end_token = "" self.reasoning_regex = re.compile( - rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL) + rf"{self.start_token}(.*?){self.end_token}", re.DOTALL) if not self.model_tokenizer: raise ValueError( "The model tokenizer must be passed to the ReasoningParser " "constructor during construction.") - self.think_start_token_id = self.vocab.get(self.think_start_token) - self.think_end_token_id = self.vocab.get(self.think_end_token) - if (self.think_start_token_id is None - or self.think_end_token_id is None): + self.start_token_id = self.vocab.get(self.start_token) + self.end_token_id = self.vocab.get(self.end_token) + if self.start_token_id is None or self.end_token_id is None: raise RuntimeError( "DeepSeek R1 reasoning parser could not locate think start/end " "tokens in the tokenizer!") - # TODO: need to rebase by PR #14428 def is_reasoning_end(self, input_ids: list[int]) -> bool: - return self.think_end_token_id in input_ids + return self.end_token_id in input_ids def extract_content_ids(self, input_ids: list[int]) -> list[int]: """ Extract the content after the end tokens """ - if self.think_end_token_id not in input_ids[:-1]: + if self.end_token_id not in input_ids[:-1]: return [] else: - return input_ids[input_ids.index(self.think_end_token_id) + 1:] + return input_ids[input_ids.index(self.end_token_id) + 1:] def extract_reasoning_content_streaming( self, @@ -77,22 +78,24 @@ class DeepSeekR1ReasoningParser(ReasoningParser): """ # Skip single special tokens if len(delta_token_ids) == 1 and (delta_token_ids[0] in [ - self.think_start_token_id, self.think_end_token_id + self.start_token_id, self.end_token_id ]): return None # Check if is present in previous or delta. 
# Keep compatibility with models that don't generate tokens. - if self.think_start_token_id in previous_token_ids: - if self.think_end_token_id in delta_token_ids: + if self.start_token_id in previous_token_ids: + if self.end_token_id in delta_token_ids: # in previous, in delta, # extract reasoning content - end_index = delta_text.find(self.think_end_token) + end_index = delta_text.find(self.end_token) reasoning_content = delta_text[:end_index] - content = delta_text[end_index + len(self.think_end_token):] - return DeltaMessage(reasoning_content=reasoning_content, - content=content if content else None) - elif self.think_end_token_id in previous_token_ids: + content = delta_text[end_index + len(self.end_token):] + return DeltaMessage( + reasoning_content=reasoning_content, + content=content if content else None, + ) + elif self.end_token_id in previous_token_ids: # in previous, in previous, # reasoning content continues return DeltaMessage(content=delta_text) @@ -100,17 +103,18 @@ class DeepSeekR1ReasoningParser(ReasoningParser): # in previous, no in previous or delta, # reasoning content continues return DeltaMessage(reasoning_content=delta_text) - elif self.think_start_token_id in delta_token_ids: - if self.think_end_token_id in delta_token_ids: + elif self.start_token_id in delta_token_ids: + if self.end_token_id in delta_token_ids: # in delta, in delta, extract reasoning content - start_index = delta_text.find(self.think_start_token) - end_index = delta_text.find(self.think_end_token) + start_index = delta_text.find(self.start_token) + end_index = delta_text.find(self.end_token) reasoning_content = delta_text[start_index + - len(self.think_start_token - ):end_index] - content = delta_text[end_index + len(self.think_end_token):] - return DeltaMessage(reasoning_content=reasoning_content, - content=content if content else None) + len(self.start_token):end_index] + content = delta_text[end_index + len(self.end_token):] + return DeltaMessage( + 
reasoning_content=reasoning_content, + content=content if content else None, + ) else: # in delta, no in delta, # reasoning content continues @@ -119,15 +123,17 @@ class DeepSeekR1ReasoningParser(ReasoningParser): # No in previous or delta, also need to check for . # Because the model may have generated without # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f - if self.think_end_token_id in delta_token_ids: + if self.end_token_id in delta_token_ids: # in delta with more tokens, # extract reasoning content and content - end_index = delta_text.find(self.think_end_token) + end_index = delta_text.find(self.end_token) reasoning_content = delta_text[:end_index] - content = delta_text[end_index + len(self.think_end_token):] - return DeltaMessage(reasoning_content=reasoning_content, - content=content if content else None) - elif self.think_end_token_id in previous_token_ids: + content = delta_text[end_index + len(self.end_token):] + return DeltaMessage( + reasoning_content=reasoning_content, + content=content if content else None, + ) + elif self.end_token_id in previous_token_ids: # in previous, thinking content ends return DeltaMessage(content=delta_text) else: @@ -137,22 +143,20 @@ class DeepSeekR1ReasoningParser(ReasoningParser): def extract_reasoning_content( self, model_output: str, request: ChatCompletionRequest ) -> tuple[Optional[str], Optional[str]]: - # DeepSeek R1 doesn't generate now. # Thus we assume the reasoning content is always at the start. # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f - if self.think_end_token not in model_output: + if self.end_token not in model_output: return model_output, None else: # Add a start token if it's missing to keep compatibility. 
- if self.think_start_token not in model_output: - model_output = f"{self.think_start_token}{model_output}" + if self.start_token not in model_output: + model_output = f"{self.start_token}{model_output}" # Use a regex to find the reasoning content reasoning_content = self.reasoning_regex.findall(model_output)[0] end_index = len( - f"{self.think_start_token}{reasoning_content}{self.think_end_token}" - ) + f"{self.start_token}{reasoning_content}{self.end_token}") final_output = model_output[end_index:] if len(final_output) == 0: diff --git a/vllm/entrypoints/openai/reasoning_parsers/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py similarity index 99% rename from vllm/entrypoints/openai/reasoning_parsers/granite_reasoning_parser.py rename to vllm/reasoning/granite_reasoning_parser.py index 117d051a73782..249ace1f167fa 100644 --- a/vllm/entrypoints/openai/reasoning_parsers/granite_reasoning_parser.py +++ b/vllm/reasoning/granite_reasoning_parser.py @@ -8,9 +8,8 @@ from transformers import PreTrainedTokenizerBase from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage) -from vllm.entrypoints.openai.reasoning_parsers.abs_reasoning_parsers import ( - ReasoningParser, ReasoningParserManager) from vllm.logger import init_logger +from vllm.reasoning import ReasoningParser, ReasoningParserManager logger = init_logger(__name__) From 280d074103160d042059dc60c28898fd9fb56568 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Fri, 28 Mar 2025 16:36:31 +0800 Subject: [PATCH 068/593] [CPU][CI] Improve CPU Dockerfile (#15690) Signed-off-by: jiang1.li --- .buildkite/release-pipeline.yaml | 2 +- .buildkite/run-cpu-test.sh | 16 +- Dockerfile.cpu | 155 +++++++++++++----- .../getting_started/installation/cpu.md | 35 +++- .../installation/cpu/x86.inc.md | 2 + 5 files changed, 151 insertions(+), 59 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 18f582b6e4c94..a1dcb01e482bb 100644 --- 
a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -82,7 +82,7 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f Dockerfile.cpu ." - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" env: DOCKER_BUILDKIT: "1" diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 05744bb5225b8..bf9f191d3b064 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -8,15 +8,19 @@ set -ex CORE_RANGE=${CORE_RANGE:-48-95} NUMA_NODE=${NUMA_NODE:-1} -# Try building the docker image -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu . -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu . 
- # Setup cleanup -remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; } +remove_docker_container() { + set -e; + docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; + docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; +} trap remove_docker_container EXIT remove_docker_container +# Try building the docker image +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f Dockerfile.cpu . + # Run the image, setting --shm-size=4g for tensor parallel. docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER" @@ -36,8 +40,6 @@ function cpu_tests() { # Run basic model test docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e - pip install -r vllm/requirements/test.txt - pip install -r vllm/requirements/cpu.txt pytest -v -s tests/kernels/test_cache.py -m cpu_model pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model pytest -v -s tests/models/decoder_only/language -m cpu_model diff --git a/Dockerfile.cpu b/Dockerfile.cpu index a10090529d8a9..8133651865b50 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -1,69 +1,138 @@ # This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform. 
+# +# Build targets: +# vllm-openai (default): used for serving deployment +# vllm-test: used for CI tests +# vllm-dev: used for development +# +# Build arguments: +# PYTHON_VERSION=3.12 (default)|3.11|3.10|3.9 +# VLLM_CPU_DISABLE_AVX512=false (default)|true +# -FROM ubuntu:22.04 AS cpu-test-1 +######################### BASE IMAGE ######################### +FROM ubuntu:22.04 AS base + +WORKDIR /workspace/ + +ARG PYTHON_VERSION=3.12 +ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" + +# Install minimal dependencies and uv +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt,sharing=locked \ + apt-get update -y \ + && apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \ + gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 \ + && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \ + && curl -LsSf https://astral.sh/uv/install.sh | sh ENV CCACHE_DIR=/root/.cache/ccache - ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache -RUN --mount=type=cache,target=/var/cache/apt \ - apt-get update -y \ - && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ - && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ - && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 +ENV PATH="/root/.local/bin:$PATH" +ENV VIRTUAL_ENV="/opt/venv" +RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV} +ENV PATH="$VIRTUAL_ENV/bin:$PATH" -# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html -# intel-openmp provides additional performance improvement vs. openmp -# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects. 
-RUN --mount=type=cache,target=/root/.cache/pip \ - pip install intel-openmp==2025.0.1 +ENV UV_HTTP_TIMEOUT=500 -ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so" +# Install Python dependencies +ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} +ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} +ENV UV_INDEX_STRATEGY="unsafe-best-match" +ENV UV_LINK_MODE="copy" +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \ + --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \ + uv pip install --upgrade pip && \ + uv pip install -r requirements/cpu.txt + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install intel-openmp==2024.2.1 intel_extension_for_pytorch==2.6.0 + +ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD" RUN echo 'ulimit -c 0' >> ~/.bashrc -RUN pip install intel_extension_for_pytorch==2.6.0 +######################### BUILD IMAGE ######################### +FROM base AS vllm-build -WORKDIR /workspace - -ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" -ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \ - pip install --upgrade pip && \ - pip install -r requirements/build.txt - -FROM cpu-test-1 AS build - -WORKDIR /workspace/vllm - -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \ - --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \ - pip install -v -r requirements/cpu.txt - -COPY . . ARG GIT_REPO_CHECK=0 -RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi - # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... 
ARG VLLM_CPU_DISABLE_AVX512 ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} -RUN --mount=type=cache,target=/root/.cache/pip \ +WORKDIR /workspace/vllm + +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \ + uv pip install -r requirements/build.txt + +COPY . . +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi + +RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=cache,target=/root/.cache/ccache \ --mount=type=bind,source=.git,target=.git \ - VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \ - pip install dist/*.whl && \ - rm -rf dist + VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel + +######################### DEV IMAGE ######################### +FROM vllm-build AS vllm-dev + +WORKDIR /workspace/vllm + +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt,sharing=locked \ + apt-get install -y --no-install-recommends vim numactl + +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install -e tests/vllm_test_utils + +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=bind,source=.git,target=.git \ + VLLM_TARGET_DEVICE=cpu python3 setup.py develop + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install -r requirements/dev.txt && \ + pre-commit install --hook-type pre-commit --hook-type commit-msg + +ENTRYPOINT ["bash"] + +######################### TEST IMAGE ######################### +FROM base AS vllm-test WORKDIR /workspace/ -RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,src=requirements/test.txt,target=requirements/test.txt \ + uv pip install -r requirements/test.txt + +RUN 
--mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \ + uv pip install dist/*.whl + +ADD ./tests/ ./tests/ +ADD ./examples/ ./examples/ +ADD ./benchmarks/ ./benchmarks/ # install development dependencies (for testing) -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -e tests/vllm_test_utils +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install -e tests/vllm_test_utils + +ENTRYPOINT ["bash"] + +######################### RELEASE IMAGE ######################### +FROM base AS vllm-openai + +WORKDIR /workspace/ + +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \ + uv pip install dist/*.whl ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/docs/source/getting_started/installation/cpu.md b/docs/source/getting_started/installation/cpu.md index 1b2ffd6199945..844b184afc99b 100644 --- a/docs/source/getting_started/installation/cpu.md +++ b/docs/source/getting_started/installation/cpu.md @@ -159,18 +159,37 @@ Currently, there are no pre-built CPU wheels. ### Pre-built images -Currently, there are no pre-build CPU images. +:::::{tab-set} +:sync-group: device + +::::{tab-item} Intel/AMD x86 +:sync: x86 + +:::{include} cpu/x86.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +::: + +:::: + +::::: ### Build image from source ```console -$ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . -$ docker run -it \ - --rm \ - --network=host \ - --cpuset-cpus= \ - --cpuset-mems= \ - vllm-cpu-env +$ docker build -f Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai . 
+ +# Launching OpenAI server +$ docker run --rm \ + --privileged=true \ + --shm-size=4g \ + -p 8000:8000 \ + -e VLLM_CPU_KVCACHE_SPACE= \ + -e VLLM_CPU_OMP_THREADS_BIND= \ + vllm-cpu-env \ + --model=meta-llama/Llama-3.2-1B-Instruct \ + --dtype=bfloat16 \ + other vLLM OpenAI server arguments ``` ::::{tip} diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md index b2f3bafb4e511..9ae2035db5433 100644 --- a/docs/source/getting_started/installation/cpu/x86.inc.md +++ b/docs/source/getting_started/installation/cpu/x86.inc.md @@ -34,6 +34,8 @@ There are no pre-built wheels or images for this device, so you must build vLLM ### Pre-built images +See [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo) + ### Build image from source ## Extra information From 70f2c2a7094cc5fbc6788f2b0b9b9da6973290cd Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 28 Mar 2025 17:10:40 +0800 Subject: [PATCH 069/593] [Bugfix] Fix 'InductorAdaptor object has no attribute 'cache_dir' (#15674) Signed-off-by: Jee Jee Li --- vllm/compilation/compiler_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index ab0f98bdaa3e5..d6e44fa6d3414 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -144,6 +144,7 @@ class InductorAdaptor(CompilerInterface): return hash_str def initialize_cache(self, cache_dir: str, disable_cache: bool = False): + self.cache_dir = cache_dir if disable_cache: return # redirect the cache directory to a sub-directory @@ -156,7 +157,6 @@ class InductorAdaptor(CompilerInterface): triton_cache = os.path.join(cache_dir, "triton_cache") os.makedirs(triton_cache, exist_ok=True) os.environ["TRITON_CACHE_DIR"] = triton_cache - self.cache_dir = cache_dir def compile( self, From 
a10314c6b35c7bad4320286409d8b5e6d11aa56e Mon Sep 17 00:00:00 2001 From: Lize Cai Date: Fri, 28 Mar 2025 19:00:14 +0900 Subject: [PATCH 070/593] [Misc] Fix test_sleep to use query parameters (#14373) Signed-off-by: Lize Cai Signed-off-by: youkaichao Co-authored-by: youkaichao --- tests/entrypoints/openai/test_sleep.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_sleep.py b/tests/entrypoints/openai/test_sleep.py index 8bdf00bcee126..66d8d9294018c 100644 --- a/tests/entrypoints/openai/test_sleep.py +++ b/tests/entrypoints/openai/test_sleep.py @@ -25,8 +25,9 @@ def test_sleep_mode(): "VLLM_SERVER_DEV_MODE": "1", "CUDA_VISIBLE_DEVICES": "0" }) as remote_server: + response = requests.post(remote_server.url_for("/sleep"), - data={"level": "1"}) + params={"level": "1"}) assert response.status_code == 200 response = requests.get(remote_server.url_for("/is_sleeping")) assert response.status_code == 200 From 3bbaacbe15c90ac5339aa46f481311a80038d3a9 Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Fri, 28 Mar 2025 19:20:35 +0800 Subject: [PATCH 071/593] [Bugfix][Frontend] Eliminate regex based check in reasoning full generator (#14821) Signed-off-by: Ce Gao --- .../test_deepseekr1_reasoning_parser.py | 64 +++++++++++++++++++ .../reasoning/deepseek_r1_reasoning_parser.py | 43 +++++++------ 2 files changed, 89 insertions(+), 18 deletions(-) diff --git a/tests/reasoning/test_deepseekr1_reasoning_parser.py b/tests/reasoning/test_deepseekr1_reasoning_parser.py index 7b6af183a86ad..1b669c8fd2fb9 100644 --- a/tests/reasoning/test_deepseekr1_reasoning_parser.py +++ b/tests/reasoning/test_deepseekr1_reasoning_parser.py @@ -90,6 +90,40 @@ SHORTEST_REASONING_WITH_THINK = { "content": "This is the rest", "is_reasoning_end": True, } +THINK_NO_END = { + "output": "This is a reasoning section", + "reasoning_content": "This is a reasoning section", + "content": None, + "is_reasoning_end": False, +} +EMPTY = { + "output": "", + 
"reasoning_content": "", + "content": None, + "is_reasoning_end": False, +} +EMPTY_STREAMING = { + "output": "", + "reasoning_content": None, + "content": None, + "is_reasoning_end": False, +} +NEW_LINE = { + "output": "\nThis is a reasoning section\nThis is the rest", + "reasoning_content": "This is a reasoning section", + "content": "\nThis is the rest", + "is_reasoning_end": True, +} +# Streaming cannot handle new lines at the beginning of the output +# because we need to support ... and ... +# We cannot know if the text before is reasoning content +# or not. +NEW_LINE_STREAMING = { + "output": "\nThis is a reasoning section\nThis is the rest", + "reasoning_content": "\nThis is a reasoning section", + "content": "\nThis is the rest", + "is_reasoning_end": True, +} TEST_CASES = [ pytest.param( @@ -182,6 +216,36 @@ TEST_CASES = [ SHORTEST_REASONING_WITH_THINK, id="shortest_with_think_streaming", ), + pytest.param( + False, + THINK_NO_END, + id="think_no_end", + ), + pytest.param( + True, + THINK_NO_END, + id="think_no_end_streaming", + ), + pytest.param( + False, + EMPTY, + id="empty", + ), + pytest.param( + True, + EMPTY_STREAMING, + id="empty_streaming", + ), + pytest.param( + False, + NEW_LINE, + id="new_line", + ), + pytest.param( + True, + NEW_LINE_STREAMING, + id="new_line_streaming", + ), ] diff --git a/vllm/reasoning/deepseek_r1_reasoning_parser.py b/vllm/reasoning/deepseek_r1_reasoning_parser.py index 73be6d4d1ab13..1c283c092a28c 100644 --- a/vllm/reasoning/deepseek_r1_reasoning_parser.py +++ b/vllm/reasoning/deepseek_r1_reasoning_parser.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import re from collections.abc import Sequence from typing import Optional, Union @@ -32,9 +31,6 @@ class DeepSeekR1ReasoningParser(ReasoningParser): def __init__(self, tokenizer: PreTrainedTokenizerBase): super().__init__(tokenizer) - self.reasoning_regex = re.compile( - rf"{self.start_token}(.*?){self.end_token}", re.DOTALL) - if not self.model_tokenizer: raise 
ValueError( "The model tokenizer must be passed to the ReasoningParser " @@ -143,23 +139,34 @@ class DeepSeekR1ReasoningParser(ReasoningParser): def extract_reasoning_content( self, model_output: str, request: ChatCompletionRequest ) -> tuple[Optional[str], Optional[str]]: + """ + Extract reasoning content from the model output. + + For text abcxyz: + - 'abc' goes to reasoning_content + - 'xyz' goes to content + + Returns: + tuple[Optional[str], Optional[str]]: reasoning content and content + """ + + # Check if the start token is present in the model output, remove it + # if it is present. + model_output_parts = model_output.partition(self.start_token) + model_output = model_output_parts[2] if model_output_parts[ + 1] else model_output_parts[0] + # DeepSeek R1 doesn't generate now. # Thus we assume the reasoning content is always at the start. # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f if self.end_token not in model_output: return model_output, None else: - # Add a start token if it's missing to keep compatibility. - if self.start_token not in model_output: - model_output = f"{self.start_token}{model_output}" - # Use a regex to find the reasoning content - reasoning_content = self.reasoning_regex.findall(model_output)[0] - - end_index = len( - f"{self.start_token}{reasoning_content}{self.end_token}") - final_output = model_output[end_index:] - - if len(final_output) == 0: - return reasoning_content, None - - return reasoning_content, final_output + reasoning_content, _, content = model_output.partition( + self.end_token) + # If the end token is not found, return the model output as is. + # It should not happen since we already checked for the presence + # of the end token. 
+ # If generation stops right after end-of-think, return null content + final_content = content or None + return reasoning_content, final_content From fd5fd2690275e90865023a0bcac0047ecb3f3897 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Fri, 28 Mar 2025 19:40:12 +0800 Subject: [PATCH 072/593] [Frontend] update priority for --api-key and VLLM_API_KEY (#15588) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- vllm/entrypoints/openai/api_server.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 6c1f60fa6a3b4..7dbe31e62da67 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -818,7 +818,8 @@ def build_app(args: Namespace) -> FastAPI: return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST) - if token := envs.VLLM_API_KEY or args.api_key: + # Ensure --api-key option from CLI takes precedence over VLLM_API_KEY + if token := args.api_key or envs.VLLM_API_KEY: @app.middleware("http") async def authentication(request: Request, call_next): From 0b4167526d030391785e28e44c68d2e1cdc5ad3b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 28 Mar 2025 13:03:21 +0000 Subject: [PATCH 073/593] [Docs] Add "Generation quality changed" section to troubleshooting (#15701) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/source/getting_started/troubleshooting.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md index fdfaf9f932698..87fa442e9a489 100644 --- a/docs/source/getting_started/troubleshooting.md +++ b/docs/source/getting_started/troubleshooting.md @@ -26,6 +26,14 @@ To isolate the model downloading and loading issue, you can use the `--load-form If the model is too large to fit in a 
single GPU, you will get an out-of-memory (OOM) error. Consider [using tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. +## Generation quality changed + +In v0.8.0, the source of default sampling parameters was changed in . Prior to v0.8.0, the default sampling parameters came from vLLM's set of neutral defaults. From v0.8.0 onwards, the default sampling parameters come from the `generation_config.json` provided by the model creator. + +In most cases, this should lead to higher quality responses, because the model creator is likely to know which sampling parameters are best for their model. However, in some cases the defaults provided by the model creator can lead to degraded performance. + +You can check if this is happening by trying the old defaults with `--generation-config vllm` for online and `generation_config="vllm"` for offline. If, after trying this, your generation quality improves we would recommend continuing to use the vLLM defaults and petition the model creator on to update their default `generation_config.json` so that it produces better quality generations. + ## Enable more logging If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. 
You can use the following environment variables to help debug the issue: From 91276c57210b36997861af706a48ac784573ed4c Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 28 Mar 2025 21:14:09 +0800 Subject: [PATCH 074/593] [Model] Adding torch compile annotations to chatglm (#15624) Signed-off-by: Jee Jee Li --- vllm/model_executor/models/chatglm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 14dca23b3934f..a51a0af9e2bcf 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -10,6 +10,7 @@ from torch import nn from torch.nn import LayerNorm from vllm.attention import Attention +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul @@ -293,6 +294,7 @@ class GLMTransformer(nn.Module): return hidden_states +@support_torch_compile class ChatGLMModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): From 3b00ff91380044fa409612401309b9cb6a82685f Mon Sep 17 00:00:00 2001 From: Chauncey Date: Fri, 28 Mar 2025 21:14:53 +0800 Subject: [PATCH 075/593] [Bugfix][v1] xgrammar structured output supports Enum. 
(#15594) Signed-off-by: chaunceyjiang --- .../llm/test_struct_output_generate.py | 53 +++++++++++++++++++ vllm/v1/structured_output/utils.py | 4 -- 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 6bdfa0fae4a2c..00fa47575b6ae 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -4,10 +4,12 @@ from __future__ import annotations import json import re +from enum import Enum from typing import Any import jsonschema import pytest +from pydantic import BaseModel from vllm.entrypoints.llm import LLM from vllm.outputs import RequestOutput @@ -390,3 +392,54 @@ def test_guided_choice_completion( assert generated_text is not None assert generated_text in sample_guided_choice print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + +class CarType(str, Enum): + sedan = "sedan" + suv = "SUV" + truck = "Truck" + coupe = "Coupe" + + +class CarDescription(BaseModel): + brand: str + model: str + car_type: CarType + + +@pytest.mark.skip_global_cleanup +@pytest.mark.parametrize("guided_decoding_backend", + GUIDED_DECODING_BACKENDS_V1) +@pytest.mark.parametrize("model_name", MODELS_TO_TEST) +def test_guided_json_completion_with_enum( + monkeypatch: pytest.MonkeyPatch, + guided_decoding_backend: str, + model_name: str, +): + monkeypatch.setenv("VLLM_USE_V1", "1") + llm = LLM(model=model_name, + max_model_len=1024, + guided_decoding_backend=guided_decoding_backend) + json_schema = CarDescription.model_json_schema() + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams(json=json_schema)) + outputs = llm.generate( + prompts="Generate a JSON with the brand, model and car_type of" + "the most iconic car from the 90's", + sampling_params=sampling_params, + use_tqdm=True) + + assert outputs is not None + + for 
output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + prompt = output.prompt + + generated_text = output.outputs[0].text + assert generated_text is not None + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + output_json = json.loads(generated_text) + jsonschema.validate(instance=output_json, schema=json_schema) diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py index 694e46f763f02..a771256ef29fd 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -26,10 +26,6 @@ def has_xgrammar_unsupported_json_features(schema: dict[str, Any]) -> bool: if "pattern" in obj: return True - # Check for enum restrictions - if "enum" in obj: - return True - # Check for numeric ranges if obj.get("type") in ("integer", "number") and any( key in obj From 541d1df486ac863ab057d842791695763de6f58b Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 28 Mar 2025 23:27:52 +0800 Subject: [PATCH 076/593] [Bugfix] `embed_is_patch` for Idefics3 (#15696) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/commandr.py | 1 - vllm/model_executor/models/idefics3.py | 501 ++++++++++++++-------- vllm/model_executor/models/mllama.py | 1 - vllm/model_executor/models/qwen2_audio.py | 2 +- vllm/model_executor/models/ultravox.py | 3 +- 5 files changed, 320 insertions(+), 188 deletions(-) diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index b0cb4a62333a4..e7e73f446df27 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -24,7 +24,6 @@ from typing import Iterable, Optional, Set, Tuple, Union import torch -import torch.utils.checkpoint from torch import nn from transformers import CohereConfig diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 432f26141048b..327ec4640f03e 100644 --- a/vllm/model_executor/models/idefics3.py +++ 
b/vllm/model_executor/models/idefics3.py @@ -17,16 +17,14 @@ import math from collections.abc import Iterable, Mapping, Sequence -from typing import Dict, List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Dict, Literal, Optional, Set, Tuple, TypedDict, Union import torch -import torch.utils.checkpoint from torch import nn from transformers import (BatchFeature, Idefics3Config, Idefics3ImageProcessor, Idefics3Processor) from vllm.config import VllmConfig -from vllm.logger import init_logger from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig @@ -35,13 +33,16 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.inputs import NestedTensors -from vllm.multimodal.parse import ImageProcessorItems +from vllm.multimodal.parse import ImageProcessorItems, ImageSize +# yapf conflicts with isort for this block +# yapf: disable from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, MultiModalDataItems, MultiModalFieldConfig, - PromptReplacement, PromptUpdate) + PromptReplacement, PromptUpdate, + encode_tokens) +# yapf: enable from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors @@ -53,18 +54,28 @@ from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal from .llama import LlamaModel from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, merge_multimodal_embeddings) - -logger = init_logger(__name__) +from .vision import scatter_patch_features, select_patch_features class Idefics3ImagePixelInputs(TypedDict): type: 
Literal["pixel_values"] - data: torch.Tensor + pixel_values: torch.Tensor """ Shape: `(batch_size * num_images * num_patches, num_channels, height, width)` """ - pixel_attention_mask: Optional[torch.BoolTensor] + pixel_attention_mask: torch.Tensor + + num_patches: torch.Tensor + """Shape: `(batch_size * num_images)`""" + + embed_is_patch: Union[torch.Tensor, list[torch.Tensor]] + """ + A boolean mask indicating which image embeddings correspond + to patch tokens. + + Shape: `(batch_size * num_images, num_embeds)` + """ class Idefics3ImageEmbeddingInputs(TypedDict): @@ -75,6 +86,14 @@ class Idefics3ImageEmbeddingInputs(TypedDict): `hidden_size` must match the hidden size of language model backbone. """ + embed_is_patch: Union[torch.Tensor, list[torch.Tensor]] + """ + A boolean mask indicating which image embeddings correspond + to patch tokens. + + Shape: `(batch_size * num_images, num_embeds)` + """ + ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs] @@ -100,32 +119,14 @@ class Idefics3ProcessingInfo(BaseProcessingInfo): seq_len: int, mm_counts: Mapping[str, int], ) -> Mapping[str, int]: - hf_processor = self.get_hf_processor() - image_processor: Idefics3ImageProcessor = hf_processor.image_processor - grid_w, grid_h = self._get_image_feature_grid_size( - image_width=image_processor.size['longest_edge'], - image_height=image_processor.size['longest_edge'], - ) - num_image_token = (grid_w * grid_h + 1) * hf_processor.image_seq_len - # Calculate Non-image-token length - # NOTE: and are special token for SmolVLM - # but not for Idefic3, so we need to tokenize them to get actual length. 
- tokenizer = self.get_tokenizer() - tile_token_len = len(tokenizer.tokenize("")) - glob_token_len = len(tokenizer.tokenize(hf_processor.global_image_tag)) - # linebreak and always cost 1 token - fake_token_len = lb_len = 1 - non_image_token = (grid_w * grid_h) * ( - tile_token_len + fake_token_len) + glob_token_len + ( - grid_h + 1) * lb_len + fake_token_len - return {"image": num_image_token + non_image_token} + return {"image": self.get_max_image_tokens()} def _resize_output_size(self, *, height: int, width: int, max_len: Optional[int] = None, - min_len: Optional[int] = 1, + min_len: int = 1, max_size: Optional[int] = None) -> tuple[int, int]: # Set default value for max_len if not provided max_len = max(height, width) if max_len is None else max_len @@ -181,10 +182,13 @@ class Idefics3ProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - size: Optional[dict[str, object]] = None, + processor: Optional[Idefics3Processor], ) -> tuple[int, int]: - hf_processor = self.get_hf_processor(size=size) - image_processor: Idefics3ImageProcessor = hf_processor.image_processor + if processor is None: + processor = self.get_hf_processor() + + image_processor: Idefics3ImageProcessor = processor.image_processor + max_image_size = image_processor.max_image_size['longest_edge'] size = image_processor.size['longest_edge'] assert size % max_image_size == 0, ( @@ -204,6 +208,105 @@ class Idefics3ProcessingInfo(BaseProcessingInfo): grid_h = grid_w = 0 return grid_w, grid_h + def get_num_patches( + self, + *, + image_width: int, + image_height: int, + processor: Optional[Idefics3Processor], + ) -> int: + grid_w, grid_h = self._get_image_feature_grid_size( + image_width=image_width, + image_height=image_height, + processor=processor, + ) + + return grid_w * grid_h + 1 + + def get_image_repl( + self, + *, + image_width: int, + image_height: int, + processor: Optional[Idefics3Processor], + ) -> str: + if processor is None: + processor = self.get_hf_processor() + + 
image_token = processor.image_token.content + fake_image_token = processor.fake_image_token.content + global_img_token = processor.global_image_tag + image_seq_len = processor.image_seq_len + grid_placeholder = "" + + p_img = image_token * image_seq_len + global_img_placeholder = fake_image_token + global_img_token + p_img + tile_img_placeholder = fake_image_token + grid_placeholder + p_img + + grid_w, grid_h = self._get_image_feature_grid_size( + image_width=image_width, + image_height=image_height, + processor=processor, + ) + if grid_w == 0 and grid_h == 0: + return global_img_placeholder + fake_image_token + + tiles_placeholder = list[str]() + for i in range(grid_h): + for j in range(grid_w): + placeholder_per_tile = tile_img_placeholder.format(n_h=i + 1, + n_w=j + 1) + tiles_placeholder.append(placeholder_per_tile) + # Add line break if it is the last tile in the row + if j == grid_w - 1: + tiles_placeholder.append("\n") + + return "".join([ + *tiles_placeholder, + "\n", + global_img_placeholder, + fake_image_token, + ]) + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + processor: Optional[Idefics3Processor], + ) -> int: + tokenizer = self.get_tokenizer() + image_repl = self.get_image_repl( + image_width=image_width, + image_height=image_height, + processor=processor, + ) + + image_repl_tokens = encode_tokens( + tokenizer, + image_repl, + add_special_tokens=False, + ) + return len(image_repl_tokens) + + def get_image_size_with_most_features(self) -> ImageSize: + processor = self.get_hf_processor() + image_processor: Idefics3ImageProcessor = processor.image_processor + + return ImageSize( + width=image_processor.size["longest_edge"], + height=image_processor.size["longest_edge"], + ) + + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + processor=None, + ) + class 
Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo] ): @@ -217,7 +320,7 @@ class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo] hf_processor = self.info.get_hf_processor() image_processor: Idefics3ImageProcessor = hf_processor.image_processor longest_edge = image_processor.max_image_size['longest_edge'] - image_token: str = hf_processor.image_token.content + image_token = hf_processor.image_token.content mm_data = { "image": @@ -241,26 +344,61 @@ class Idefics3MultiModalProcessor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], ) -> BatchFeature: - if mm_data: - processed_outputs = super()._call_hf_processor( - prompt, mm_data, mm_kwargs) - image_grids = [ - self.info._get_image_feature_grid_size( - image_width=img.width, - image_height=img.height, - **mm_kwargs, - ) for img in mm_data["images"] - ] - image_patches = list(map(lambda x: math.prod(x) + 1, image_grids)) - for key in ("pixel_values", "pixel_attention_mask"): - data = processed_outputs.pop(key) - data = data.flatten(0, 1).split(image_patches) - processed_outputs[key] = data - else: - tokenizer = self.info.get_tokenizer() - processed_outputs = tokenizer(prompt, - add_special_tokens=True, - return_tensors="pt") + # Text-only input not supported in composite processor + if not (images := mm_data.get("images", [])): + prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + processed_outputs = super()._call_hf_processor( + prompt, + mm_data, + mm_kwargs, + ) + + parsed_images = (self._get_data_parser().parse_mm_data({ + "image": images + }).get_items("image", ImageProcessorItems)) + image_sizes = [ + parsed_images.get_image_size(i) for i in range(len(parsed_images)) + ] + hf_processor = self.info.get_hf_processor(**mm_kwargs) + + image_repl_features = [ + self.info.get_image_repl(image_width=size.width, + 
image_height=size.height, + processor=hf_processor) + for size in image_sizes + ] + + tokenizer = self.info.get_tokenizer() + image_repls_feature_tokens = [ + tokenizer.encode(image_repl, add_special_tokens=False) + for image_repl in image_repl_features + ] + + vocab = tokenizer.get_vocab() + image_token_id = vocab[hf_processor.image_token.content] + + embed_is_patch = [ + torch.tensor(image_repl_tokens) == image_token_id + for image_repl_tokens in image_repls_feature_tokens + ] + processed_outputs["embed_is_patch"] = embed_is_patch + + num_patches = [ + self.info.get_num_patches( + image_width=size.width, + image_height=size.height, + processor=hf_processor, + ) for size in image_sizes + ] + processed_outputs["num_patches"] = torch.tensor(num_patches) + + # Remove the extra batch dimension + processed_outputs["pixel_values"].squeeze_(0) + processed_outputs["pixel_attention_mask"].squeeze_(0) + return processed_outputs def _get_mm_fields_config( @@ -268,10 +406,16 @@ class Idefics3MultiModalProcessor( hf_inputs: BatchFeature, hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: + num_patches = hf_inputs.get("num_patches", torch.empty(0)) + return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - pixel_attention_mask=MultiModalFieldConfig.batched("image"), + pixel_values=MultiModalFieldConfig.flat_from_sizes( + "image", num_patches), + pixel_attention_mask=MultiModalFieldConfig.flat_from_sizes( + "image", num_patches), image_embeds=MultiModalFieldConfig.batched("image"), + num_patches=MultiModalFieldConfig.batched("image"), + embed_is_patch=MultiModalFieldConfig.batched("image"), ) def _get_prompt_updates( @@ -281,42 +425,18 @@ class Idefics3MultiModalProcessor( out_mm_kwargs: MultiModalKwargs, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - image_token = hf_processor.image_token.content - fake_image_token = hf_processor.fake_image_token.content - global_img_token = 
hf_processor.global_image_tag - image_seq_len = hf_processor.image_seq_len - grid_placeholder = "" - - p_img = image_token * image_seq_len - global_img_placeholder = fake_image_token + global_img_token + p_img - tile_img_placeholder = fake_image_token + grid_placeholder + p_img def get_replacement_idefics3(item_idx: int) -> str: images = mm_items.get_items("image", ImageProcessorItems) image_size = images.get_image_size(item_idx) - grid_w, grid_h = self.info._get_image_feature_grid_size( + + return self.info.get_image_repl( image_width=image_size.width, image_height=image_size.height, - **hf_processor_mm_kwargs, + processor=hf_processor, ) - if grid_w == 0 and grid_h == 0: - image_placeholder = global_img_placeholder - else: - tiles_placeholder = list[str]() - for i in range(grid_h): - for j in range(grid_w): - placeholder_per_tile = tile_img_placeholder.format( - n_h=i + 1, n_w=j + 1) - tiles_placeholder.append(placeholder_per_tile) - # Add line break if it is the last tile in the row - if j == grid_w - 1: - tiles_placeholder.append("\n") - - image_placeholder = "".join( - [*tiles_placeholder, "\n", global_img_placeholder]) - return image_placeholder + fake_image_token return [ PromptReplacement( @@ -424,73 +544,13 @@ class Idefics3Model(nn.Module): config.vision_config.patch_size)**2) / (config.scale_factor**2)) self.image_token_id = self.config.image_token_id - def _validate_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: - - h = w = self.config.vision_config.image_size - expected_dims = (3, h, w) - - def _validate_shape(d: torch.Tensor): - actual_dims = tuple(d.shape[1:]) - - if actual_dims != expected_dims: - expected_expr = ("num_patches", *map(str, expected_dims)) - raise ValueError( - "The expected shape of pixel values per image per batch " - f"is {expected_expr}. 
You supplied {tuple(d.shape)}.") - - for d in data: - _validate_shape(d) - - return data - - def _parse_and_validate_image_input( - self, **kwargs: object) -> Optional[ImageInputs]: - pixel_values = kwargs.pop("pixel_values", None) - image_embeds = kwargs.pop("image_embeds", None) - pixel_attention_mask = kwargs.pop("pixel_attention_mask", None) - - if pixel_values is None and image_embeds is None: - return None - - if image_embeds is not None: - if not isinstance(image_embeds, (torch.Tensor, list)): - raise ValueError("Incorrect type of image embeddings. " - f"Got type: {type(image_embeds)}") - - return Idefics3ImageEmbeddingInputs( - type="image_embeds", - data=flatten_bn(image_embeds, concat=True), - ) - - if pixel_values is not None: - if not isinstance(pixel_values, (torch.Tensor, list)): - raise ValueError("Incorrect type of pixel values. " - f"Got type: {type(pixel_values)}") - - if isinstance(pixel_values, list): - pixel_values = torch.cat(pixel_values, dim=1) - pixel_attention_mask = torch.cat(pixel_attention_mask, dim=1) - else: - pixel_values = flatten_bn(pixel_values) - pixel_attention_mask = flatten_bn(pixel_attention_mask) - - return Idefics3ImagePixelInputs( - type="pixel_values", - data=self._validate_pixel_values(pixel_values), - pixel_attention_mask=pixel_attention_mask) - - raise AssertionError("This line should be unreachable.") - - def _image_pixels_to_features( + def image_pixels_to_features( self, pixel_values: torch.Tensor, - pixel_attention_mask: Optional[torch.BoolTensor] = None, - ) -> NestedTensors: + pixel_attention_mask: torch.Tensor, + ) -> torch.Tensor: # NOTE: we skip the step to select the vision feature layer since # this is already done inside the vision tower - num_patches = [x.size(0) for x in pixel_values] pixel_values = pixel_values.to( dtype=self.vision_model.embeddings.patch_embedding.weight.dtype ) # fp16 compatibility @@ -502,17 +562,9 @@ class Idefics3Model(nn.Module): pixel_values = 
pixel_values[real_images_inds].contiguous() # Handle the vision attention mask - if pixel_attention_mask is None: - pixel_attention_mask = torch.ones( - size=(pixel_values.size(0), pixel_values.size(2), - pixel_values.size(3)), - dtype=torch.bool, - device=pixel_values.device, - ) - else: - # Remove padding images from the mask - pixel_attention_mask = pixel_attention_mask[ - real_images_inds].contiguous() + # Remove padding images from the mask + pixel_attention_mask = pixel_attention_mask[ + real_images_inds].contiguous() patch_size = self.config.vision_config.patch_size patches_subgrid = pixel_attention_mask.unfold(dimension=1, @@ -529,27 +581,7 @@ class Idefics3Model(nn.Module): patch_attention_mask=patch_attention_mask, ) - return image_hidden_states.split(num_patches) - - def _process_image_pixels( - self, inputs: Idefics3ImagePixelInputs) -> NestedTensors: - assert self.vision_model is not None - - pixel_values = inputs["data"] - pixel_attention_mask = inputs["pixel_attention_mask"] - - return self._image_pixels_to_features(pixel_values, - pixel_attention_mask) - - def _process_image_input(self, image_input: ImageInputs) -> torch.Tensor: - if image_input["type"] == "image_embeds": - return image_input["data"] - - assert self.vision_model is not None - image_features = self._process_image_pixels(image_input) - num_patches = [x.size(0) for x in image_features] - image_features = torch.cat(image_features) - return self.connector(image_features).split(num_patches) + return image_hidden_states def get_input_embeddings( self, @@ -616,13 +648,113 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, self.logits_processor = LogitsProcessor(config.text_config.vocab_size) self.sampler = get_sampler() + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + h = w = self.config.vision_config.image_size + expected_dims = (3, h, w) + + def _validate_shape(d: torch.Tensor): + actual_dims = tuple(d.shape) + + if actual_dims != 
expected_dims: + expected_expr = str(expected_dims) + raise ValueError( + "The expected shape of pixel values per image per batch " + f" per patch is {expected_expr}. " + f"You supplied {tuple(d.shape)}.") + + for d in data: + _validate_shape(d) + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[ImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + + if pixel_values is None and image_embeds is None: + return None + + embed_is_patch = kwargs.pop("embed_is_patch") + if not isinstance(embed_is_patch, (torch.Tensor, list)): + raise ValueError("Incorrect type of embed_is_patch. " + f"Got type: {type(embed_is_patch)}") + + embed_is_patch = flatten_bn(embed_is_patch) + + if image_embeds is not None: + if not isinstance(image_embeds, (torch.Tensor, list)): + raise ValueError("Incorrect type of image embeddings. " + f"Got type: {type(image_embeds)}") + + return Idefics3ImageEmbeddingInputs( + type="image_embeds", + data=flatten_bn(image_embeds, concat=True), + embed_is_patch=embed_is_patch, + ) + + if pixel_values is not None: + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + pixel_attention_mask = kwargs.pop("pixel_attention_mask") + if not isinstance(pixel_attention_mask, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel_attention_mask. " + f"Got type: {type(pixel_attention_mask)}") + + num_patches = kwargs.pop("num_patches") + if not isinstance(num_patches, (torch.Tensor, list)): + raise ValueError("Incorrect type of num_patches. 
" + f"Got type: {type(num_patches)}") + + pixel_values = flatten_bn(pixel_values, concat=True) + pixel_attention_mask = flatten_bn(pixel_attention_mask, + concat=True) + num_patches = flatten_bn(num_patches, concat=True) + + return Idefics3ImagePixelInputs( + type="pixel_values", + pixel_values=self._validate_pixel_values(pixel_values), + pixel_attention_mask=pixel_attention_mask, + num_patches=num_patches, + embed_is_patch=embed_is_patch, + ) + + raise AssertionError("This line should be unreachable.") + + def _process_image_pixels( + self, inputs: Idefics3ImagePixelInputs) -> torch.Tensor: + pixel_values = inputs["pixel_values"] + pixel_attention_mask = inputs["pixel_attention_mask"] + + return self.model.image_pixels_to_features( + pixel_values, + pixel_attention_mask=pixel_attention_mask, + ) + + def _process_image_input(self, image_input: ImageInputs) -> torch.Tensor: + if image_input["type"] == "image_embeds": + return image_input["data"] + + image_features = self._process_image_pixels(image_input) + image_features = self.model.connector(image_features) + + num_patches = image_input["num_patches"] + return image_features.split(num_patches.tolist()) + def get_multimodal_embeddings( self, **kwargs: object) -> Optional[MultiModalEmbeddings]: - image_input = self.model._parse_and_validate_image_input(**kwargs) + image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None - vision_embeddings = self.model._process_image_input(image_input) - return vision_embeddings + + image_features = self._process_image_input(image_input) + + return scatter_patch_features( + image_features, + image_input["embed_is_patch"], + ) def get_input_embeddings( self, @@ -632,8 +764,11 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, inputs_embeds = self.model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - 
self.config.image_token_id) + input_ids, + inputs_embeds, + select_patch_features(multimodal_embeddings), + self.config.image_token_id, + ) return inputs_embeds def forward( diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index d2c8fb7237274..ac4bdbc41e441 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -21,7 +21,6 @@ from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union import numpy as np import torch import torch.nn.functional as F -import torch.utils.checkpoint import transformers.models.mllama.configuration_mllama as config_mllama from PIL.Image import Image from torch import nn diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index f63bd0a11459a..ccb5a3f600b2d 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -160,7 +160,7 @@ class Qwen2AudioMultiModalProcessor( mm_kwargs: Mapping[str, Any], ) -> BatchFeature: # Text-only input not supported in composite processor - if not mm_data or not mm_data.get("audios", []): + if not mm_data.get("audios", []): prompt_ids = self.info.get_tokenizer().encode(prompt) prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index cb1e143838496..6e73a2ae656c2 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -8,7 +8,6 @@ from functools import cached_property from typing import Any, Literal, Optional, Set, Tuple, TypedDict, Union import torch -import torch.utils.checkpoint from torch import nn from torch.nn import functional as F from transformers import BatchFeature, ProcessorMixin @@ -160,7 +159,7 @@ class UltravoxMultiModalProcessor( mm_kwargs: Mapping[str, object], ) -> BatchFeature: # Text-only input 
not supported in composite processor - if not mm_data or not mm_data.get("audios", []): + if not mm_data.get("audios", []): prompt_ids = self.info.get_tokenizer().encode( prompt, add_special_tokens=False) prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) From 7329ff5468eceaf17f4b193ae3ef0b43c7bf38d6 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 28 Mar 2025 11:46:45 -0400 Subject: [PATCH 077/593] [V1] Support disable_any_whtespace for guidance backend (#15584) Signed-off-by: Russell Bryant --- tests/entrypoints/llm/test_guided_generate.py | 62 +++---------------- .../llm/test_struct_output_generate.py | 54 +++------------- vllm/engine/arg_utils.py | 3 +- .../guided_decoding/guidance_decoding.py | 12 +++- vllm/v1/engine/processor.py | 11 ++-- vllm/v1/structured_output/backend_guidance.py | 19 +++--- 6 files changed, 44 insertions(+), 117 deletions(-) diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index 5f1a91cb2b19f..3f275e0b2ec74 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -6,7 +6,6 @@ import weakref import jsonschema import pytest -from pydantic import BaseModel from vllm.distributed import cleanup_dist_env_and_memory from vllm.entrypoints.llm import LLM @@ -15,7 +14,10 @@ from vllm.sampling_params import GuidedDecodingParams, SamplingParams MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" GUIDED_DECODING_BACKENDS = [ - "outlines", "lm-format-enforcer", "xgrammar", "guidance" + "outlines", + "lm-format-enforcer", + "xgrammar:disable-any-whitespace", + "guidance:disable-any-whitespace", ] @@ -322,59 +324,9 @@ def test_guided_json_object(llm, guided_decoding_backend: str): print(generated_text) assert generated_text is not None + if 'disable-any-whitespace' in guided_decoding_backend: + assert "\n" not in generated_text + # Parse to verify it is valid JSON parsed_json = json.loads(generated_text) assert 
isinstance(parsed_json, dict) - - -@pytest.mark.skip_global_cleanup -def test_json_with_any_whitespace_disabled(llm): - - class ResponseSchema(BaseModel): - clarifying_question: str - cost_per_serving: str - calories: str - type_dish_ids: str - type_meal_ids: str - product_ids: list[str] - exclude_product_ids: list[str] - allergen_ids: list[str] - total_cooking_time: str - kitchen_ids: str - holiday_ids: str - - # Note: Without this setting, the response is sometimes full of `\n` - # for some models. This option prevents that. - guided_decoding_backend = 'xgrammar:disable-any-whitespace' - - schema = ResponseSchema.model_json_schema() - guided_params = GuidedDecodingParams(json=schema, - backend=\ - guided_decoding_backend) - sampling_params = SamplingParams(max_tokens=2000, - frequency_penalty=0, - presence_penalty=-1.1, - repetition_penalty=1.3, - guided_decoding=guided_params) - - prompt = ("<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You" - "are a helpful assistant.<|im_end|>\n<|im_start|>user\nI want a " - "quick launch fast with $10.<|im_end|>\n<|im_start|>assistant\n") - outputs = llm.generate(prompts=prompt, - sampling_params=sampling_params, - use_tqdm=True) - - assert outputs is not None - - for output in outputs: - assert output is not None - assert isinstance(output, RequestOutput) - - generated_text = output.outputs[0].text - assert generated_text is not None - assert "\n" not in generated_text - - # Parse to verify it is valid JSON - parsed_json = json.loads(generated_text) - assert isinstance(parsed_json, dict) - jsonschema.validate(instance=parsed_json, schema=schema) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 00fa47575b6ae..c9fa03a1ae1fb 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -15,7 +15,9 @@ from vllm.entrypoints.llm import LLM from vllm.outputs import 
RequestOutput from vllm.sampling_params import GuidedDecodingParams, SamplingParams -GUIDED_DECODING_BACKENDS_V1 = ["xgrammar", "guidance"] +GUIDED_DECODING_BACKENDS_V1 = [ + "xgrammar:disable-any-whitespace", "guidance:disable-any-whitespace" +] MODELS_TO_TEST = [ "Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410" ] @@ -55,50 +57,8 @@ def test_guided_json_completion( generated_text = output.outputs[0].text assert generated_text is not None - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - output_json = json.loads(generated_text) - jsonschema.validate(instance=output_json, schema=sample_json_schema) - - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", - GUIDED_DECODING_BACKENDS_V1) -@pytest.mark.parametrize("model_name", MODELS_TO_TEST) -def test_guided_json_completion_disable_any_whitespace( - monkeypatch: pytest.MonkeyPatch, - sample_json_schema: dict[str, Any], - guided_decoding_backend: str, - model_name: str, -): - if guided_decoding_backend != "xgrammar": - pytest.skip("disable-any-whitespace is only supported for xgrammar.") - guided_decoding_backend = 'xgrammar:disable-any-whitespace' - - monkeypatch.setenv("VLLM_USE_V1", "1") - llm = LLM(model=model_name, - max_model_len=1024, - guided_decoding_backend=guided_decoding_backend) - sampling_params = SamplingParams( - temperature=1.0, - max_tokens=1000, - guided_decoding=GuidedDecodingParams(json=sample_json_schema)) - outputs = llm.generate(prompts=[ - f"Give an example JSON for an employee profile " - f"that fits this schema: {sample_json_schema}" - ] * 2, - sampling_params=sampling_params, - use_tqdm=True) - - assert outputs is not None - - for output in outputs: - assert output is not None - assert isinstance(output, RequestOutput) - prompt = output.prompt - - generated_text = output.outputs[0].text - assert generated_text is not None - assert "\n" not in generated_text + if 'disable-any-whitespace' in guided_decoding_backend: + 
assert "\n" not in generated_text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") output_json = json.loads(generated_text) jsonschema.validate(instance=output_json, schema=sample_json_schema) @@ -142,7 +102,7 @@ def test_guided_json_object( # Parse to verify it is valid JSON parsed_json = json.loads(generated_text) allowed_types: tuple[type, ...] = (dict, ) - if guided_decoding_backend == "xgrammar": + if guided_decoding_backend.startswith("xgrammar"): # TODO - we are currently too permissive with xgrammar and # allow # any valid json (typically comes back as a list or # object). We can fix this by specifying a jsonschema of @@ -170,7 +130,7 @@ def test_guided_json_unsupported_schema( temperature=1.0, max_tokens=1000, guided_decoding=GuidedDecodingParams(json=unsupported_json_schema)) - if guided_decoding_backend == "xgrammar": + if guided_decoding_backend.startswith("xgrammar"): with pytest.raises(ValueError, match="The provided JSON schema contains features " "not supported by xgrammar."): diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a416fa8aa08e3..6f498af36a403 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1561,7 +1561,8 @@ class EngineArgs: # Xgrammar and Guidance are supported. 
SUPPORTED_GUIDED_DECODING = [ - "xgrammar", "xgrammar:disable-any-whitespace", "guidance", "auto" + "xgrammar", "xgrammar:disable-any-whitespace", "guidance", + "guidance:disable-any-whitespace", "auto" ] if self.guided_decoding_backend not in SUPPORTED_GUIDED_DECODING: _raise_or_fallback(feature_name="--guided-decoding-backend", diff --git a/vllm/model_executor/guided_decoding/guidance_decoding.py b/vllm/model_executor/guided_decoding/guidance_decoding.py index d8675a14030de..f19ebcbe420e3 100644 --- a/vllm/model_executor/guided_decoding/guidance_decoding.py +++ b/vllm/model_executor/guided_decoding/guidance_decoding.py @@ -18,14 +18,22 @@ def get_local_guidance_guided_decoding_logits_processor( """ grm = "" + any_whitespace = 'disable-any-whitespace' not in \ + guided_params.backend_options() if guided_params.json: grm = llguidance.LLMatcher.grammar_from_json_schema( guided_params.json, - overrides={"whitespace_pattern": guided_params.whitespace_pattern}) + overrides={"whitespace_pattern": guided_params.whitespace_pattern}, + defaults={ + "whitespace_flexible": any_whitespace, + }) elif guided_params.json_object: grm = llguidance.LLMatcher.grammar_from_json_schema( '{"type": "object"}', - overrides={"whitespace_pattern": guided_params.whitespace_pattern}) + overrides={"whitespace_pattern": guided_params.whitespace_pattern}, + defaults={ + "whitespace_flexible": any_whitespace, + }) elif guided_params.regex: grm = llguidance.grammar_from("regex", guided_params.regex) elif guided_params.choice: diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 24762d214c345..dbaf0abaea18a 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -121,7 +121,8 @@ class Processor: return supported_backends = [ - "xgrammar", "xgrammar:disable-any-whitespace", "guidance", "auto" + "xgrammar", "xgrammar:disable-any-whitespace", "guidance", + "guidance:disable-any-whitespace", "auto" ] engine_level_backend = 
self.decoding_config.guided_decoding_backend if engine_level_backend not in supported_backends: @@ -140,11 +141,10 @@ class Processor: raise ValueError("Structured output is not supported on TPU.") # Request content validation - - if engine_level_backend == "xgrammar": + if engine_level_backend.startswith("xgrammar"): # xgrammar with no fallback validate_structured_output_request_xgrammar(params) - params.guided_decoding.backend = "xgrammar" + params.guided_decoding.backend = engine_level_backend elif engine_level_backend == "auto": # "auto" is an opt-in to opinionated behavior where we try to # choose a backend based on request contents. This is not the @@ -158,12 +158,13 @@ class Processor: # are not supported in xgrammar. Fall back to guidance. params.guided_decoding.backend = "guidance" - if params.guided_decoding.backend == "guidance": + if engine_level_backend.startswith("guidance"): # TODO ideally we would have the LLTokenizer here as Lark syntax # allows <|special_token|> and similar, see # https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens # Without tokenizer these are disallowed in grammars. 
validate_guidance_grammar(params, tokenizer=None) + params.guided_decoding.backend = engine_level_backend def process_inputs( self, diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index 1e274ad0ae623..a7ba710169497 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -41,6 +41,9 @@ class GuidanceBackend(StructuredOutputBackend): tokenizer_group.ping() self.vllm_config = vllm_config self.vocab_size = vllm_config.model_config.get_vocab_size() + self.disable_any_whitespace = ( + "disable-any-whitespace" + in vllm_config.decoding_config.guided_decoding_backend) tokenizer = tokenizer_group.get_lora_tokenizer(None) self.ll_tokenizer = llguidance_hf.from_tokenizer(tokenizer, None) @@ -48,7 +51,7 @@ class GuidanceBackend(StructuredOutputBackend): def compile_grammar(self, request_type: StructuredOutputOptions, grammar_spec: str) -> StructuredOutputGrammar: self.serialized_grammar = serialize_guidance_grammar( - request_type, grammar_spec) + request_type, grammar_spec, self.disable_any_whitespace) ll_matcher = llguidance.LLMatcher( self.ll_tokenizer, @@ -126,17 +129,19 @@ class GuidanceGrammar(StructuredOutputGrammar): def serialize_guidance_grammar(request_type: StructuredOutputOptions, - grammar_spec: str) -> str: + grammar_spec: str, + disable_any_whitespace: bool = False) -> str: if request_type == StructuredOutputOptions.JSON: - # TODO: make whitespace_flexible configurable return llguidance.LLMatcher.grammar_from_json_schema( - grammar_spec, defaults={ - "whitespace_flexible": True, + grammar_spec, + defaults={ + "whitespace_flexible": not disable_any_whitespace, }) elif request_type == StructuredOutputOptions.JSON_OBJECT: return llguidance.LLMatcher.grammar_from_json_schema( - '{"type": "object"}', defaults={ - "whitespace_flexible": True, + '{"type": "object"}', + defaults={ + "whitespace_flexible": not disable_any_whitespace, }) else: if 
request_type == StructuredOutputOptions.REGEX: From 2914006fe09875ebfa33626d945e34173c7441c6 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Fri, 28 Mar 2025 23:56:48 +0800 Subject: [PATCH 078/593] [doc] add missing imports (#15699) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- docs/source/models/generative_models.md | 6 ++++++ docs/source/models/pooling_models.md | 8 ++++++++ docs/source/performance/optimization.md | 2 ++ docs/source/serving/multimodal_inputs.md | 8 ++++++++ docs/source/serving/offline_inference.md | 6 ++++++ 5 files changed, 30 insertions(+) diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index c94e940b8534c..63fc53b0e7c55 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -23,6 +23,8 @@ It is similar to [its counterpart in HF Transformers](https://huggingface.co/doc except that tokenization and detokenization are also performed automatically. 
```python +from vllm import LLM + llm = LLM(model="facebook/opt-125m") outputs = llm.generate("Hello, my name is") @@ -36,6 +38,8 @@ You can optionally control the language generation by passing {class}`~vllm.Samp For example, you can use greedy sampling by setting `temperature=0`: ```python +from vllm import LLM, SamplingParams + llm = LLM(model="facebook/opt-125m") params = SamplingParams(temperature=0) outputs = llm.generate("Hello, my name is", params) @@ -83,6 +87,8 @@ Base models may perform poorly as they are not trained to respond to the chat co ::: ```python +from vllm import LLM + llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") conversation = [ { diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index f774f3d0fa0ed..dbcd846cc9779 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -68,6 +68,8 @@ The {class}`~vllm.LLM.encode` method is available to all pooling models in vLLM. It returns the extracted hidden states directly, which is useful for reward models. ```python +from vllm import LLM + llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", task="reward") (output,) = llm.encode("Hello, my name is") @@ -81,6 +83,8 @@ The {class}`~vllm.LLM.embed` method outputs an embedding vector for each prompt. It is primarily designed for embedding models. ```python +from vllm import LLM + llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed") (output,) = llm.embed("Hello, my name is") @@ -96,6 +100,8 @@ The {class}`~vllm.LLM.classify` method outputs a probability vector for each pro It is primarily designed for classification models. 
```python +from vllm import LLM + llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify") (output,) = llm.classify("Hello, my name is") @@ -116,6 +122,8 @@ To handle RAG at a higher level, you should use integration frameworks such as [ ::: ```python +from vllm import LLM + llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score") (output,) = llm.score("What is the capital of France?", "The capital of Brazil is Brasilia.") diff --git a/docs/source/performance/optimization.md b/docs/source/performance/optimization.md index 5b0f8421a51eb..ccbe8a367061f 100644 --- a/docs/source/performance/optimization.md +++ b/docs/source/performance/optimization.md @@ -31,6 +31,8 @@ vLLM supports an experimental feature chunked prefill. Chunked prefill allows to You can enable the feature by specifying `--enable-chunked-prefill` in the command line or setting `enable_chunked_prefill=True` in the LLM constructor. ```python +from vllm import LLM + llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True) # Set max_num_batched_tokens to tune performance. # NOTE: 2048 is the default max_num_batched_tokens for chunked prefill. 
diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md index 2e2016c95e4fc..f45d36c3ccaca 100644 --- a/docs/source/serving/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -21,6 +21,8 @@ To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType` You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples: ```python +from vllm import LLM + llm = LLM(model="llava-hf/llava-1.5-7b-hf") # Refer to the HuggingFace repo for the correct format to use @@ -65,6 +67,8 @@ Full example: To substitute multiple images inside the same text prompt, you can pass in a list of images instead: ```python +from vllm import LLM + llm = LLM( model="microsoft/Phi-3.5-vision-instruct", trust_remote_code=True, # Required to load Phi-3.5-vision @@ -96,6 +100,8 @@ Full example: Date: Fri, 28 Mar 2025 23:58:44 +0800 Subject: [PATCH 079/593] [Bugfix] Fix regex compile display format (#15368) Signed-off-by: Kebe --- vllm/transformers_utils/tokenizers/mistral.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 2d036e2c83f74..d893431f4871b 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -124,13 +124,15 @@ def find_tokenizer_file(files: List[str]): matched_files = [file for file in files if file_pattern.match(file)] if len(matched_files) > 1: - raise OSError(f"Found {len(matched_files)} files matching the " - f"pattern: {file_pattern}. Make sure only one Mistral " - f"tokenizer is present in {files}.") + raise OSError( + f"Found {len(matched_files)} files matching the " + f"pattern: `{file_pattern.pattern}`. 
Make sure only one Mistral " + f"tokenizer is present in {files}.") elif len(matched_files) == 0: - raise OSError(f"Found {len(matched_files)} files matching the " - f"pattern: {file_pattern}. Make sure that a Mistral " - f"tokenizer is present in {files}.") + raise OSError( + f"Found {len(matched_files)} files matching the " + f"pattern: `{file_pattern.pattern}`. Make sure that a Mistral " + f"tokenizer is present in {files}.") return matched_files[0] From 47e9038d2386d31b8493ac995094bdc1aec710ce Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 28 Mar 2025 10:29:32 -0600 Subject: [PATCH 080/593] Fix cpu offload testing for gptq/awq/ct (#15648) Signed-off-by: mgoin --- tests/quantization/test_cpu_offload.py | 12 +++++++--- tests/utils.py | 33 ++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py index 79afcc916f2bb..a7d6518514c72 100644 --- a/tests/quantization/test_cpu_offload.py +++ b/tests/quantization/test_cpu_offload.py @@ -33,7 +33,9 @@ def test_cpu_offload_fp8(): @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), reason="gptq_marlin is not supported on this GPU type.") -def test_cpu_offload_gptq(): +def test_cpu_offload_gptq(monkeypatch): + # This quant method is sensitive to dummy weights, so we force real weights + monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto') # Test GPTQ Marlin compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [], ["--cpu-offload-gb", "1"], @@ -47,7 +49,9 @@ def test_cpu_offload_gptq(): @pytest.mark.skipif(not is_quant_method_supported("awq_marlin"), reason="awq_marlin is not supported on this GPU type.") -def test_cpu_offload_awq(): +def test_cpu_offload_awq(monkeypatch): + # This quant method is sensitive to dummy weights, so we force real weights + monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto') # Test AWQ Marlin compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [], 
["--cpu-offload-gb", "1"], @@ -61,7 +65,9 @@ def test_cpu_offload_awq(): @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), reason="gptq_marlin is not supported on this GPU type.") -def test_cpu_offload_compressed_tensors(): +def test_cpu_offload_compressed_tensors(monkeypatch): + # This quant method is sensitive to dummy weights, so we force real weights + monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto') # Test wNa16 compare_two_settings("nm-testing/tinyllama-oneshot-w4a16-channel-v2", [], ["--cpu-offload-gb", "1"], diff --git a/tests/utils.py b/tests/utils.py index a827b6d4b9bfe..8915453ebd0a3 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -317,6 +317,37 @@ def _test_completion_close( return results +def _test_chat( + client: openai.OpenAI, + model: str, + prompt: str, +): + results = [] + + messages = [{ + "role": "user", + "content": [{ + "type": "text", + "text": prompt + }] + }] + + # test with text prompt + chat_response = client.chat.completions.create(model=model, + messages=messages, + max_tokens=5, + temperature=0.0) + + results.append({ + "test": "completion_close", + "text": chat_response.choices[0].message.content, + "finish_reason": chat_response.choices[0].finish_reason, + "usage": chat_response.usage, + }) + + return results + + def _test_embeddings( client: openai.OpenAI, model: str, @@ -512,6 +543,8 @@ def compare_all_settings(model: str, results += _test_completion(client, model, prompt, token_ids) elif method == "generate_close": results += _test_completion_close(client, model, prompt) + elif method == "generate_chat": + results += _test_chat(client, model, prompt) elif method == "generate_with_image": results += _test_image_text( client, model, From 70e132244a61425a6b88c0b8345e496dc5bdfecd Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 28 Mar 2025 09:30:08 -0700 Subject: [PATCH 081/593] [Minor] Remove TGI launching script (#15646) Signed-off-by: Woosuk Kwon --- benchmarks/benchmark_serving.py | 3 --- 
.../benchmark_serving_structured_output.py | 3 --- benchmarks/launch_tgi_server.sh | 16 ---------------- 3 files changed, 22 deletions(-) delete mode 100755 benchmarks/launch_tgi_server.sh diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 82c6b426b9a2b..e2f712dfc6f49 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -7,9 +7,6 @@ On the server side, run one of the following commands: --swap-space 16 \ --disable-log-requests - (TGI backend) - ./launch_tgi_server.sh - On the client side, run: python benchmarks/benchmark_serving.py \ --backend \ diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index c79a93faff197..71cb420a52c46 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -5,9 +5,6 @@ On the server side, run one of the following commands: (vLLM OpenAI API server) vllm serve --disable-log-requests - (TGI backend) - ./launch_tgi_server.sh - On the client side, run: python benchmarks/benchmark_serving_structured_output.py \ --backend \ diff --git a/benchmarks/launch_tgi_server.sh b/benchmarks/launch_tgi_server.sh deleted file mode 100755 index ba7383d88dc49..0000000000000 --- a/benchmarks/launch_tgi_server.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -PORT=8000 -MODEL=$1 -TOKENS=$2 - -docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \ - -v "$PWD/data:/data" \ - ghcr.io/huggingface/text-generation-inference:2.2.0 \ - --model-id "$MODEL" \ - --sharded false \ - --max-input-length 1024 \ - --max-total-tokens 2048 \ - --max-best-of 5 \ - --max-concurrent-requests 5000 \ - --max-batch-total-tokens "$TOKENS" From c6bc0034d0b8d3960d0a44565812ca253fc95943 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 29 Mar 2025 00:41:16 +0800 Subject: [PATCH 082/593] [Misc] Remove unused utils and clean up imports (#15708) Signed-off-by: DarkLight1337 
--- tests/multimodal/test_utils.py | 69 +---------------- vllm/multimodal/utils.py | 119 ------------------------------ vllm/v1/core/sched/output.py | 3 +- vllm/v1/worker/gpu_input_batch.py | 9 +-- 4 files changed, 5 insertions(+), 195 deletions(-) diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 8f76d895fdd29..a3f136c5667d5 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -9,12 +9,10 @@ from typing import TYPE_CHECKING, NamedTuple, Optional import numpy as np import pytest from PIL import Image, ImageChops -from transformers import AutoConfig, AutoTokenizer from vllm.multimodal.inputs import PlaceholderRange from vllm.multimodal.utils import (MediaConnector, - merge_and_sort_multimodal_metadata, - repeat_and_pad_placeholder_tokens) + merge_and_sort_multimodal_metadata) if TYPE_CHECKING: from vllm.multimodal.hasher import MultiModalHashDict @@ -136,71 +134,6 @@ async def test_fetch_image_local_files(image_url: str): f"file://{temp_dir}/../{os.path.basename(image_url)}") -@pytest.mark.parametrize("model", ["llava-hf/llava-v1.6-mistral-7b-hf"]) -def test_repeat_and_pad_placeholder_tokens(model): - config = AutoConfig.from_pretrained(model) - image_token_id = config.image_token_index - - tokenizer = AutoTokenizer.from_pretrained(model) - - test_cases = [ - ( - "", - 2, - "", - [32000, 32000], - [{ "offset": 0, "length": 2 }], - ), - ( - "", - 2, - "", - [32000, 32000, 32000], - [{ "offset": 0, "length": 2 }], - ), - ( - "", - [3, 2], - "", - [32000, 32000, 32000, 32000, 32000], - [{ "offset": 0, "length": 3 }, { "offset": 3, "length": 2 }], - ), - ( - "Image:Image:!", - [3, 2], - "Image:Image:!", - [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918], - [{ "offset": 2, "length": 3 }, { "offset": 7, "length": 2 }], - ), - ( - "", - [3, 2], - "", - [32000, 32000, 32000], - [{ "offset": 0, "length": 3 }], - ), - ] # yapf: disable - - for ( - prompt, - repeat_count, - expected_prompt, - 
expected_token_ids, - expected_ranges, - ) in test_cases: - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer=tokenizer, - prompt=prompt, - prompt_token_ids=tokenizer.encode(prompt, - add_special_tokens=False), - placeholder_token_id=image_token_id, - repeat_count=repeat_count, - ) - assert new_prompt == expected_prompt - assert new_token_ids == expected_token_ids - assert ranges == expected_ranges - - # Used for the next two tests related to `merge_and_sort_multimodal_metadata`. class TestCase(NamedTuple): mm_positions: "MultiModalPlaceholderDict" diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index ad381e1d1d00d..8e4fb7eac49c0 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -12,8 +12,6 @@ from PIL import Image import vllm.envs as envs from vllm.connections import HTTPConnection, global_http_connection -from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer from .audio import AudioMediaIO from .base import MediaIO @@ -21,8 +19,6 @@ from .image import ImageEmbeddingMediaIO, ImageMediaIO from .inputs import PlaceholderRange from .video import VideoMediaIO -logger = init_logger(__name__) - _M = TypeVar("_M") if TYPE_CHECKING: @@ -296,121 +292,6 @@ def encode_video_base64(frames: npt.NDArray) -> str: return video_io.encode_base64(frames) -# Utilities for input processors -_T = TypeVar("_T", str, int) - - -def repeat_and_pad_token( - token: _T, - *, - repeat_count: int = 1, - pad_token_left: Optional[_T] = None, - pad_token_right: Optional[_T] = None, -) -> list[_T]: - replacement = [token] * repeat_count - if pad_token_left is not None: - replacement = [pad_token_left] + replacement - if pad_token_right is not None: - replacement = replacement + [pad_token_right] - - return replacement - - -def repeat_and_pad_placeholder_tokens( - tokenizer: AnyTokenizer, - prompt: Optional[str], - prompt_token_ids: list[int], - *, - placeholder_token_id: int, - 
repeat_count: Union[int, list[int]], - pad_token_left: Optional[int] = None, - pad_token_right: Optional[int] = None, -) -> tuple[Optional[str], list[int], list[PlaceholderRange]]: - if isinstance(repeat_count, int): - repeat_count = [repeat_count] - - if prompt is None: - new_prompt = None - else: - placeholder_token_str = tokenizer.decode(placeholder_token_id) - pad_token_str_left = (None if pad_token_left is None else - tokenizer.decode(pad_token_left)) - pad_token_str_right = (None if pad_token_right is None else - tokenizer.decode(pad_token_right)) - - placeholder_token_count = prompt.count(placeholder_token_str) - # This is an arbitrary number to distinguish between the two cases - if placeholder_token_count > 16: - logger.warning( - "Please follow the prompt format that is " - "documented on HuggingFace which does not involve " - "repeating %s tokens.", placeholder_token_str) - if placeholder_token_count < len(repeat_count): - logger.warning( - "The number of multi-modal placeholder tokens in the prompt " - "is less than the number of multi-modal inputs. 
Extra " - "placeholder tokens will be treated as plain text") - repeat_count = repeat_count[:placeholder_token_count] - - prompt_parts = prompt.split(placeholder_token_str, - maxsplit=len(repeat_count)) - new_prompt = "" - for i, repeat_count_item in enumerate(repeat_count): - replacement_str = "".join( - repeat_and_pad_token( - placeholder_token_str, - repeat_count=repeat_count_item, - pad_token_left=pad_token_str_left, - pad_token_right=pad_token_str_right, - )) - # The image tokens are removed to be consistent with HuggingFace - new_prompt += prompt_parts[i] + replacement_str - new_prompt += prompt_parts[-1] - - new_token_ids = list[int]() - placeholder_ranges = list[PlaceholderRange]() - placeholder_token_idx = 0 - for i, token in enumerate(prompt_token_ids): - if token == placeholder_token_id: - curr_repeat_count = repeat_count[placeholder_token_idx] - replacement_ids = repeat_and_pad_token( - placeholder_token_id, - repeat_count=curr_repeat_count, - pad_token_left=pad_token_left, - pad_token_right=pad_token_right, - ) - offset = len(new_token_ids) - if pad_token_left is not None: - offset += 1 - placeholder_ranges.append({ - "offset": offset, - "length": curr_repeat_count, - }) - new_token_ids.extend(replacement_ids) - placeholder_token_idx += 1 - - # No need to further scan the list since we replaced all tokens - if placeholder_token_idx >= len(repeat_count): - new_token_ids.extend(prompt_token_ids[i + 1:]) - break - else: - new_token_ids.append(token) - - return new_prompt, new_token_ids, placeholder_ranges - - -def consecutive_placeholder_ranges( - num_items: int, - item_size: int, - initial_offset: int = 0) -> list[PlaceholderRange]: - """Returns a list of consecutive PlaceholderRanges of a fixed size""" - - return [ - PlaceholderRange(offset=initial_offset + i * item_size, - length=item_size) for i in range(num_items) - ] - - def merge_and_sort_multimodal_metadata( mm_positions: "MultiModalPlaceholderDict", mm_hashes: Optional["MultiModalHashDict"], diff 
--git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index bb883acdb44b6..dc0d2d59fea7f 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -10,8 +10,7 @@ if TYPE_CHECKING: import numpy.typing as npt from vllm.lora.request import LoRARequest - from vllm.multimodal import MultiModalKwargs - from vllm.multimodal.base import PlaceholderRange + from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.sampling_params import SamplingParams from vllm.v1.request import Request diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 01a5cb5548bb4..351b358155801 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -2,13 +2,13 @@ # Datastructures defining an input batch from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional, cast +from typing import Optional, cast import numpy as np import torch from vllm.lora.request import LoRARequest -from vllm.multimodal import MultiModalKwargs +from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.sampling_params import SamplingParams, SamplingType from vllm.utils import swap_dict_values from vllm.v1.outputs import LogprobsTensors @@ -18,9 +18,6 @@ from vllm.v1.worker.block_table import BlockTable _SAMPLING_EPS = 1e-5 -if TYPE_CHECKING: - from vllm.multimodal.inputs import PlaceholderRange - @dataclass class CachedRequestState: @@ -29,7 +26,7 @@ class CachedRequestState: prompt_token_ids: list[int] prompt: Optional[str] mm_inputs: list[MultiModalKwargs] - mm_positions: list["PlaceholderRange"] + mm_positions: list[PlaceholderRange] sampling_params: SamplingParams generator: Optional[torch.Generator] From d03308be0c8d5c0e367bb67e8d6e158eb373f5e4 Mon Sep 17 00:00:00 2001 From: shangmingc Date: Sat, 29 Mar 2025 01:33:32 +0800 Subject: [PATCH 083/593] [Misc] Remove stale func in KVTransferConfig (#14746) Signed-off-by: Shangming Cai --- vllm/config.py | 6 
------ 1 file changed, 6 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 5c73ff56ebbcf..6a15109c6744d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2986,12 +2986,6 @@ class KVTransferConfig(BaseModel): return self.kv_connector is not None and \ self.kv_role in ["kv_producer", "kv_consumer", "kv_both"] - @property - def need_kv_parallel_group(self) -> bool: - # for those database-based connector, vLLM does not need to create - # parallel group, and in that case the kv parallel size will be 1. - return self.kv_connector is not None and self.kv_parallel_size > 1 - @property def is_kv_producer(self) -> bool: return self.kv_connector is not None and \ From 038bededbac67e873d3aa6155c7c05674b98db8c Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Fri, 28 Mar 2025 10:37:52 -0700 Subject: [PATCH 084/593] [TPU] [Perf] Improve Memory Usage Estimation (#15671) Signed-off-by: Robert Shaw Co-authored-by: Robert Shaw --- vllm/v1/worker/tpu_worker.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 4d9a113e39ee4..c8691ee87fe6a 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -161,7 +161,13 @@ class TPUWorker: # intermediate activations. m = xm.get_memory_info(self.device) total_memory_size = m["bytes_limit"] - profiled = m["peak_bytes_used"] # Weights + intermediate activations. + current_mem = m["bytes_used"] + # Ideally we would use profiled = m["peak_bytes_used"] to + # get weights + activations. But there is memory used during + # compilation / weight loading that impacts the peak and + # there is no way to reset peak memory in XLA, So we + # use the heuristic of 2% of weights. + profiled = current_mem * 1.02 # Calculate the TPU KV cache size based on profiling. 
usable_memory_size = int(total_memory_size * From 04437e313dbbf4427734a3ed2d1d650efc57ef66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Fri, 28 Mar 2025 16:01:09 -0400 Subject: [PATCH 085/593] [Bugfix] [torch.compile] Add Dynamo metrics context during compilation (#15639) Signed-off-by: luka --- tests/compile/test_full_graph.py | 79 +++++++++++++++++--------- vllm/compilation/compiler_interface.py | 38 ++++++++++++- 2 files changed, 89 insertions(+), 28 deletions(-) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 3a45c35442ca8..5311a4ce21054 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -2,21 +2,20 @@ from __future__ import annotations -from typing import Any +from typing import Any, Union import pytest import torch from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams -from vllm.config import CompilationLevel +from vllm.config import CompilationConfig, CompilationLevel from vllm.platforms import current_platform from ..utils import create_new_process_for_each_test -@pytest.fixture(params=None, name="model_info") -def models_list_fixture(request): +def models_list(all: bool): TEST_MODELS: list[tuple[str, dict[str, Any]]] = [ ("facebook/opt-125m", {}), ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { @@ -33,6 +32,9 @@ def models_list_fixture(request): ("meta-llama/Llama-3.2-1B-Instruct", {}), ] + if not all: + return TEST_MODELS + if is_quant_method_supported("aqlm"): TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { "quantization": "aqlm" @@ -77,7 +79,7 @@ def models_list_fixture(request): "optimization_level", [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE], ) -@pytest.mark.parametrize("model_info", "", indirect=True) +@pytest.mark.parametrize("model_info", models_list(all=True)) @create_new_process_for_each_test() def test_full_graph( monkeypatch: pytest.MonkeyPatch, @@ 
-91,25 +93,50 @@ def test_full_graph( m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") print(f"MODEL={model}") - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0) - llm = LLM( - model=model, - enforce_eager=True, - tensor_parallel_size=1, - disable_custom_all_reduce=True, - compilation_config=optimization_level, - **model_kwargs, - ) - outputs = llm.generate(prompts, sampling_params) + run_model(optimization_level, model, model_kwargs) - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +# TODO(luka) add other supported compilation config scenarios here +@pytest.mark.parametrize( + "compilation_config", + # additional compile sizes + [ + CompilationConfig(level=CompilationLevel.PIECEWISE, + compile_sizes=[1, 2]) + ]) +# only test some of the models +@pytest.mark.parametrize("model_info", models_list(all=False)) +@create_new_process_for_each_test() +def test_custom_compile_config( + model_info: tuple[str, dict[str, Any]], + compilation_config: CompilationConfig, +): + model, model_kwargs = model_info + print(f"MODEL={model}") + run_model(compilation_config, model, model_kwargs) + + +def run_model(compile_config: Union[int, CompilationConfig], model: str, + model_kwargs: dict[str, Any]): + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0) + llm = LLM( + model=model, + enforce_eager=True, + tensor_parallel_size=1, + disable_custom_all_reduce=True, + compilation_config=compile_config, + **model_kwargs, + ) + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. 
+ for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index d6e44fa6d3414..5a22cf70aadab 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +import contextlib import copy import hashlib +import importlib.metadata import os from contextlib import ExitStack from typing import Any, Callable, Dict, List, Optional, Tuple @@ -9,6 +11,7 @@ from unittest.mock import patch import torch import torch._inductor.compile_fx import torch.fx as fx +from packaging.version import Version from vllm.config import VllmConfig @@ -285,6 +288,9 @@ class InductorAdaptor(CompilerInterface): "torch._inductor.codecache.FxGraphCache._check_can_cache", _check_can_cache)) + # Dynamo metrics context, see method for more details. + stack.enter_context(self.metrics_context()) + compiled_graph = compile_fx( graph, example_inputs, @@ -309,8 +315,14 @@ class InductorAdaptor(CompilerInterface): hash_str = handle[0] from torch._inductor.codecache import FxGraphCache - with patch("torch._inductor.codecache.FxGraphCache._get_shape_env", - lambda *args, **kwargs: AlwaysHitShapeEnv()): + with ExitStack() as exit_stack: + exit_stack.enter_context( + patch("torch._inductor.codecache.FxGraphCache._get_shape_env", + lambda *args, **kwargs: AlwaysHitShapeEnv())) + + # Dynamo metrics context, see method for more details. 
+ exit_stack.enter_context(self.metrics_context()) + if torch.__version__.startswith("2.5"): inductor_compiled_graph = FxGraphCache._lookup_graph( hash_str, example_inputs, True, False) @@ -351,6 +363,28 @@ class InductorAdaptor(CompilerInterface): return compiled_graph + def metrics_context(self) -> contextlib.AbstractContextManager: + """ + This method returns the Dynamo metrics context (if it exists, + otherwise a null context). It is used by various compile components. + Present in torch>=2.6, it's used inside FxGraphCache in + torch==2.6 (but not after). It might also be used in various other + torch.compile internal functions. + + Because it is re-entrant, we always set it (even if entering via Dynamo + and the context was already entered). We might want to revisit if it + should be set at a different level of compilation. + + This is likely a bug in PyTorch: public APIs should not rely on + manually setting up internal contexts. But we also rely on non-public + APIs which might not provide these guarantees. 
+ """ + if Version(importlib.metadata.version('torch')) >= Version("2.6"): + import torch._dynamo.utils + return torch._dynamo.utils.get_metrics_context() + else: + return contextlib.nullcontext() + class EagerAdaptor(CompilerInterface): name = "eager" From c3f687ac227f93cfb8a5558da871d9dfa68095ab Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com> Date: Fri, 28 Mar 2025 16:19:04 -0400 Subject: [PATCH 086/593] [V1] TPU - Fix the chunked prompt bug (#15713) Signed-off-by: Alexander Matveev --- tests/v1/tpu/test_basic.py | 5 ++++- vllm/v1/worker/tpu_model_runner.py | 13 +++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py index 591aa9c5878ae..0d7e8d8d7f5e9 100644 --- a/tests/v1/tpu/test_basic.py +++ b/tests/v1/tpu/test_basic.py @@ -48,7 +48,10 @@ def test_models( with vllm_runner( model, - max_model_len=8192, + # Note: max_num_batched_tokens == 1024 is needed here to + # actually test chunked prompt + max_num_batched_tokens=1024, + max_model_len=8196, gpu_memory_utilization=0.7, max_num_seqs=16, tensor_parallel_size=tensor_parallel_size) as vllm_model: diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 5401fff2bf19b..695e31f715b4d 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -618,6 +618,7 @@ class TPUModelRunner: # Update the cache state concurrently. Code above will not block until # we use `selected_token_ids`. 
Add mark_step if post-processing changes request_seq_lens: list[tuple[int, CachedRequestState, int]] = [] + discard_sampled_tokens_req_indices = [] for i, req_id in zip(range(num_reqs), self.input_batch.req_ids): assert req_id is not None req_state = self.requests[req_id] @@ -633,6 +634,10 @@ class TPUModelRunner: # This relies on cuda-specific torch-internal impl details generator.set_offset(generator.get_offset() - 4) + # Record the index of the request that should not be sampled, + # so that we could clear the sampled tokens before returning. + discard_sampled_tokens_req_indices.append(i) + assert all( req_id is not None for req_id in self.input_batch.req_ids[:num_reqs]), "req_ids contains None" @@ -646,11 +651,19 @@ class TPUModelRunner: if max_gen_len == 1: valid_sampled_token_ids = selected_token_ids.tolist() + # Mask out the sampled tokens that should not be sampled. + # TODO: Keep in sync with gpu_model_runner.py, in particular + # the "else" case here + for i in discard_sampled_tokens_req_indices: + valid_sampled_token_ids[i].clear() + + # Append sampled tokens for i, req_state, seq_len in request_seq_lens: token_id = valid_sampled_token_ids[i][0] self.input_batch.token_ids_cpu[i, seq_len] = token_id req_state.output_token_ids.append(token_id) self.input_batch.num_tokens[i] += 1 + else: valid_mask = selected_token_ids != INVALID_TOKEN_ID gen_lens = valid_mask.sum(dim=1).tolist() From 26df46ee59e05882f7f46268f731a8e4b3ae0454 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Sat, 29 Mar 2025 06:23:00 +0800 Subject: [PATCH 087/593] [Misc] cli auto show default value (#15582) Signed-off-by: reidliu41 --- vllm/benchmarks/serve.py | 4 +--- vllm/engine/arg_utils.py | 25 ++++++++----------------- vllm/entrypoints/openai/cli_args.py | 2 +- vllm/utils.py | 2 +- 4 files changed, 11 insertions(+), 22 deletions(-) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index cddfd672e7ab0..813556f90f534 100644 --- 
a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -726,15 +726,13 @@ def add_cli_args(parser: argparse.ArgumentParser): default="ttft,tpot,itl", help="Comma-seperated list of selected metrics to report percentils. " "This argument specifies the metrics to report percentiles. " - "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " - "Default value is \"ttft,tpot,itl\".") + "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". ") parser.add_argument( "--metric-percentiles", type=str, default="99", help="Comma-seperated list of percentiles for selected metrics. " "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " - "Default value is \"99\". " "Use \"--percentile-metrics\" to select metrics.", ) parser.add_argument( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6f498af36a403..ca511c7434f83 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -322,9 +322,7 @@ class EngineArgs: parser.add_argument('--download-dir', type=nullable_str, default=EngineArgs.download_dir, - help='Directory to download and load the weights, ' - 'default to the default cache dir of ' - 'huggingface.') + help='Directory to download and load the weights.') parser.add_argument( '--load-format', type=str, @@ -399,8 +397,7 @@ class EngineArgs: 'Valid backend values are "xgrammar", "guidance", and "auto". ' 'With "auto", we will make opinionated choices based on request' 'contents and what the backend libraries currently support, so ' - 'the behavior is subject to change in each release. ' - 'The default is xgrammar.') + 'the behavior is subject to change in each release.') parser.add_argument( '--logits-processor-pattern', type=nullable_str, @@ -493,8 +490,7 @@ class EngineArgs: default=EngineArgs.prefix_caching_hash_algo, help="Set the hash algorithm for prefix caching. " "Options are 'builtin' (Python's built-in hash) or 'sha256' " - "(collision resistant but with certain overheads). 
Defaults " - "to 'builtin'.", + "(collision resistant but with certain overheads).", ) parser.add_argument('--disable-sliding-window', action='store_true', @@ -568,9 +564,7 @@ class EngineArgs: type=int, default=EngineArgs.max_num_partial_prefills, help="For chunked prefill, the max number of concurrent \ - partial prefills." - "Defaults to 1", - ) + partial prefills.") parser.add_argument( "--max-long-partial-prefills", type=int, @@ -579,15 +573,13 @@ class EngineArgs: "than --long-prefill-token-threshold that will be prefilled " "concurrently. Setting this less than --max-num-partial-prefills " "will allow shorter prompts to jump the queue in front of longer " - "prompts in some cases, improving latency. Defaults to 1.") + "prompts in some cases, improving latency.") parser.add_argument( "--long-prefill-token-threshold", type=float, default=EngineArgs.long_prefill_token_threshold, help="For chunked prefill, a request is considered long if the " - "prompt is longer than this number of tokens. Defaults to 4%% of " - "the model's context length.", - ) + "prompt is longer than this number of tokens.") parser.add_argument('--max-num-seqs', type=int, default=EngineArgs.max_num_seqs, @@ -739,8 +731,7 @@ class EngineArgs: type=int, default=EngineArgs.max_cpu_loras, help=('Maximum number of LoRAs to store in CPU memory. ' - 'Must be >= than max_loras. ' - 'Defaults to max_loras.')) + 'Must be >= than max_loras.')) parser.add_argument( '--fully-sharded-loras', action='store_true', @@ -894,7 +885,7 @@ class EngineArgs: help='Set the lower bound threshold for the posterior ' 'probability of a token to be accepted. This threshold is ' 'used by the TypicalAcceptanceSampler to make sampling decisions ' - 'during speculative decoding. 
Defaults to 0.09') + 'during speculative decoding.') parser.add_argument( '--typical-acceptance-sampler-posterior-alpha', diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index e956920c2f9a7..218a8fbe10b76 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -247,7 +247,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=None, help='Max number of prompt characters or prompt ' 'ID numbers being printed in log.' - '\n\nDefault: Unlimited') + ' The default of None means unlimited.') parser.add_argument( "--disable-fastapi-docs", diff --git a/vllm/utils.py b/vllm/utils.py index afe68a2b8cb3d..bf83b38ace80d 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1212,7 +1212,7 @@ class StoreBoolean(argparse.Action): "Expected 'true' or 'false'.") -class SortedHelpFormatter(argparse.HelpFormatter): +class SortedHelpFormatter(argparse.ArgumentDefaultsHelpFormatter): """SortedHelpFormatter that sorts arguments by their option strings.""" def add_arguments(self, actions): From f3f8d8fff4c5354d5214f0f6f29e4dc5c4e3a8ca Mon Sep 17 00:00:00 2001 From: daniel-salib Date: Fri, 28 Mar 2025 17:12:02 -0700 Subject: [PATCH 088/593] implement prometheus fast-api-instrumentor for http service metrics (#15657) --- vllm/entrypoints/openai/api_server.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 7dbe31e62da67..18d75a04ab0f3 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -311,6 +311,7 @@ def mount_metrics(app: FastAPI): # See https://prometheus.github.io/client_python/multiprocess/ from prometheus_client import (CollectorRegistry, make_asgi_app, multiprocess) + from prometheus_fastapi_instrumentator import Instrumentator prometheus_multiproc_dir_path = os.getenv("PROMETHEUS_MULTIPROC_DIR", None) if 
prometheus_multiproc_dir_path is not None: @@ -318,6 +319,16 @@ def mount_metrics(app: FastAPI): prometheus_multiproc_dir_path) registry = CollectorRegistry() multiprocess.MultiProcessCollector(registry) + Instrumentator( + excluded_handlers=[ + "/metrics", + "/health", + "/load", + "/ping", + "/version", + ], + registry=registry, + ).add().instrument(app).expose(app) # Add prometheus asgi middleware to route /metrics requests metrics_route = Mount("/metrics", make_asgi_app(registry=registry)) From cff8991a50dd35c2cb9d2e6d3446a0051cac144a Mon Sep 17 00:00:00 2001 From: simpx Date: Sat, 29 Mar 2025 11:33:58 +0800 Subject: [PATCH 089/593] [Docs][V1] Optimize diagrams in prefix caching design (#15716) --- .../v1/prefix_caching/example-time-1.png | Bin 34837 -> 47947 bytes .../v1/prefix_caching/example-time-3.png | Bin 37069 -> 51241 bytes .../v1/prefix_caching/example-time-4.png | Bin 41530 -> 60607 bytes .../v1/prefix_caching/example-time-5.png | Bin 39727 -> 55437 bytes .../v1/prefix_caching/example-time-6.png | Bin 25462 -> 54829 bytes .../v1/prefix_caching/example-time-7.png | Bin 33144 -> 55922 bytes 6 files changed, 0 insertions(+), 0 deletions(-) diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-1.png b/docs/source/assets/design/v1/prefix_caching/example-time-1.png index 8849ca0237c39b4c428c4ab74c08b512812846f5..d5a165ff6944b7edd95f215685ca6185692e8876 100644 GIT binary patch literal 47947 zcmb@tXVl}=)i!+3(1sR<-iH|&YG%NaZCR#UR&%!{S&dL!)NHHEwj|07FhJ-7girzu zkO2}(r~yJC455Y^N(h8rLJy&LnD>YzZ+M@zzO|ko-&*%=OULINowCo~*S_}FiOo8- z;dnVH!uXJ%&2%vl?hC_D8Z24Ay$hQelk^XPVO&&=e?0U-uvs2U?> zX6JPAUspTBDc$yiozwWvFieb;Uf|glxCG^<-8WQ2>Hq6DI0a`Hr_zgIl!J0RrwiE> z_(-FBreH|^SN)ErSpV)QyJre?uv@LuGpxY&^_|lgxG!0OG6LVA7<@KZ@F9YKD0L{5 zJv6sFxGXr%ND-87#RztW$!t2eXBq*;yH#keR^1scg6oknRKORd^oDlQr0;d6dm1*V z4(0bu?U@EeMcuPaP>F<6Feu8VQrSI`e@#18NK+#i z>e3NyJAyr!Aad?s)14@uZ-BM>e&zo2?4)rJIm*9E`^v=Vfq^8Y4p<)lU&UZb@Xl!h 
z>_Ok?XkNzzKO6sE=1@Z`MMfUzikQzK-BB>0I{)gx>VP@@>tX+DpOb@bA1NVZ4J;=) z5GHj+(fl#%Hbu+MdrG~g9fE5LgRWsy>;zucmA!H^nLZL!InSM5_t5iS6 z9jMwctfF2a#74Q68cLkPWKwL;rfNht?pF!D7c>ISFO+(=oFj4un&-jL>5?T6yFIra zsHsZBsO7pNiybI~3RPoCp`Ax5NVly)!O6zLNDB!g)h9D4i5!U z>;)!GAk7E|otEB9mAc)!ZswpO&a|?5Enk@CB0`XaK|D;!)xcI72*uQ=mQ2!IDvcmz zyaUILlF_b{-5gPsP$kmLZl{;`b(GE&;|gAHi0!cB*KK4poFZg9?_$|(ww%e2h$1&G zHByDHUNXx%H42OMQje?(Hq(cg;W+Dpv9b10w9uH(vjREF6}T*1=t$EHB-DYJU~;O^ zpyXDbKy{f2h;$!k3$+@a>R@eEE$c?T45rQ6Oj0);Te)evUPTOsX_SaG>!MbrSI{AF zbTOutiY5?R$}1HK!%Aqq5Tm0)B{RmR9Bmis4NA<^h7CvL8(O~99Je%sN5x{h68LZq z^9G=cHqal?>^$QimFu*AL2oKl8fMc^Y13?>&b9g&3B|O6 zbPY^XK^rGy>HL6&F_ETP(hzD0rIb6R2dzRd$rL~h((?jFa+{IS2GwB%6Q{Y3KCNbo zp`V_r)BF$=XT>qb=R(h-GRh!3WV!`fqRVz!O;`0H){48N+LofcjglFyXP2o?J6Fg; zZr36TUaDoWSsy&ez&k;C9C->+%QKB`2y`L|{wkE_L0BwxG#Wf9QwK$Kb%aUMRILE5 zMOw9zGbvRI(S%0gNxl;XSX%T6%`4h1HB;&hGF(3u*pSCe+I3be4YHjepBdIz%fKqr z5;`^8(U@z2zCmYH88dwp@72TRG^I3ah#aT{5Oi^bf(m$>9@z3kOARw+i5yj^js|nJ zq2s68^2l@jzSgqPyyjGgL#$Y6)l+`GG%c5gMwP(FP^gu2*%qA-I;yXjc%-QlX4;6l z)xrRCVXN>&F8ORpCV)UOQ6^iY@d};EdAW{&l}xY0 zF(b2@CrTaL(pwX%q{AePD!sOmrjSHYks8kT(=psi2lyBzvz;mG(NX}+KZ)l>!fw_B zQjz;~r4^Fhk;G>hiK3IW$5M12@jx}TsEAMm3VPYVCe?zQmpq+pwN1&3Bh}I;d9X8D;}>^CT)&FZ5u60URXhUlQP@QrY8*Oi5@`xfY+i4LQOEX)8mImWn^dbCKI?Q zD+xoS?|T&k8I8-FSi-*FS;r)N@i~a4HIOq&>TuuccH0FB;4X3+;aO$cG5gp!DyF$^G{AKu ztX4ZCT%8I6F92~6<)KMct8ys?>kbVS#`R{+4J^>6UT0e+xW{xHJ4N;M&>ogsb*U67 zDKuq9L{9CuOmm1gYB+_*kc9?AbZT@`{c#tJCALy!uaOx9Rb`+czJ`So7?3o~X_18B zBfUM$bRw-VU@Db?&LvaokBevm6-hS-?#W69vEy+fQEjzu<~o5($uSG;H&Rt8E?>kn zM6Txh5k!j|io<3y-*HAjn`66`*vVLmvPLFZEUMhWCT6uEQNHA(EK;veQpvGr^meLF z44_o8EyoR_08azXD3ugAN1#58B9cZyby{@XR71g_I%q;_jxLQzvnler$(}_~R z?I^4aLb;^ml>sD98iJl3!LC0DioI;vDL^tXCt6q6dTQDok!r40fd#B1s09SKunr|) zLzZoDxY2MMW`naPdWWbg4O%1Mbk&3!O-vIfy(#SHL#|Tu1J9_59gikawi(l`-)ae( z)ad!l1Rd)-HBDIq9U8+3H=4>2JuowPE1t+{3eWHz5#`th-!G-TTvUT=eWqE6eWqm8 zB8q^jMuE&%OR`LvS%qRTD@ZvUP$q%o3u8vS6DEn#Oj)l7YQt%Y`^KmHJ^&rPE@e=hJ0bVRvSuj~x+ogBm1e01L4=)ngZyBUs>`+(s#TdCWg$ku 
z8(3jf5z?uYPxESGqbkF3yCg=UZ`)3mz%riJsOQ@;qq6m8x80{@wbpjjMq_}eY?^5{ zyLe^XbFye|oRXA|iKt zS~okUVl}8Md4mEEhK)?B)x=~bRK;es-;IF3>s7#zFbK8ic-&QMiSkf!2uoW)Uyy87 z(#7Jj#04D~8IxWm4$Ptej$%a)n)c9v z@?G90T&oSG!>ElQfj8x2wvu4VtnZW4oLI=2Mi`ESJ`7Y#ix*06iLvQe=(mP4n(m_{ z9il=5&?@8y9r z+2`wl#zQ244C85nK!ZxVg;d*AOA{NKQ5s=wM8)zlp98B$cGWn7)PN_Gr3Cj9UJ`)^ z*#q{HBxB8>(T0R@90GiDi1fzoLLXv-daUL~PS%UNxrvGr$iT@0>_=uoyl%?LmF`jr zkpL_OU>W{*_ynAF_y_o;5tMUOov%f{qVRp3G4ojJKlmi>CtL+lOYk=3gAc$Ofw5x+ z1$^)=6_g0N2YSQO>A;($CliJ+YPLfFOO6rzM46Ic{WU1AW(=aPV z?HC}HZX@GQaWp^VYcAa%k`vQ*i5itF`9Y`E0QDKZGV(}*&%2@2aOAQMr^rDlsW2X~ ztX;2SO~jN4sl)_b6p34{N~Vg-5mV^p@os*k6EI`tVw-NGT8Y*|&Ci$Bkv+2fZfg+a z)rjb_CYTt;HHHCF_hL<>+L0QTQH;pJBc|q*Tj3~EY|xpA9M)_sC`~5F8KlM8StLul6Q$RP;)dH!EK0kb9dL~9 zhPCpjkj4u@36tY~rK0*8-{#|vz#=S<6>DY9twnaR%x>N~9A(Iu~6arz=^^DI>xv4tUGmuS`36E~`nI0>YiaU&qYWWB(klv+;d z!5H}TB*L}n@8Pzcc=%G&$Oc^}%L&;|C|P-!E*3F1*XJdC8VBB}#VByUTrb%zARt>%ZoEq zchJZ{-LMvdnw^}JNQ?Nri|ir#K+R`;E8Q7-&bUI1A~)COf;6Bu6Z2FLOkVIiW6y6E zfCCPtD=<|?nv-@$ikgN5xD9Vutn(eCCreSQASJ$&!6%9c<9cE}E_fDe5t!u@VzNL) zhC`%LZ;)b;fGxAgINNr|DVWKRfkJmY7lz2Rr@)Oc4Ai*P^Ah|G2O{K+(ZC+3+xFP$ zMyfLzutf)gQH!Vud{svr-vYg34j))PX*;N2^UQIXbVogCEVf$?um@yE^F?in5H!#< znAW7~L>Z9Jm{lK+IUOM?X%%pNx>z77brh#6VWVldl{O4Ji49jX?Q*$|v%pC;`jFe8 z6g+h6#WH1e_+bY%?No;GqExfpD~(NQQYAVxOT`*!ne8xHnTI-KzKx+!4U+>Nc)uYv z?1e@z!s|XRfJu^45aw&bq*o6*#i3KRtu*K9sh;Mg+e$$m(plFN>u_lZfx2o`s!fU( z5fd$DQs)PN1F}rGD5ho8%R4+DaoJK+^Gv8(@CH(euP2AXZ1aFxWe8467c`y;20mqG zsCdvMlEo=z(^ModVtFi2)VPQYr-@Z1!8|gfNehGJ$m?r)b!o$Y2Y| zRCNp<=BOE>Ao9Gb)l8L}CW``NeL6@SxxkDKXefipV_7p37AkO$81}|+1b&lJ9-e3H zTxC#eOiKCK^5NLcWEI_jT&n7|(Og$c2ht#4PkQpk)s+Lg_wJ;q@KviM{S%F2s zT=zV?%2Y&NEQkJdB$b?W*Qc8erw-N#8@hEam+~ggc$`b+8ly@(F~wswhI7??Pbj0! zPPQ^h_!Uy+gD{I$xh81V7{>JkYtl;DF7{cO57j{~T^$<$k(Fw8S_ixtX;Y*Xw)siP z)dBA|4fGmHayVMC$I5i96^LF-C`47t&5k44Z!3uiXNi_GHXE@qO+nQ`4IZLCSPpDD zOiWPDklhY=DrPa(gk<5GpptMJ&h?u#?J}C`454DF91%s^9iltxEL zV&PN7#1>$g#j!@2a{4{=`QM@+u>U~W|6g{d106b! 
z+e)amz(l$g9V;YYEs6Bx~UkxG0;9+Y-XAQ=K|KV}=BpPdAl3mXR6~-y+8~zZN(^dFRvm zRPV(tW#}r?D%~#w!n>j3U0fv6L#k0B?7ZQnkP0h>HHn#avHSHi zL(+Xf;8uJdpN4I)N*rtBHVYgRb7&4DzDe*UHO6?vWd)=;%z0#lv!O14u>vpv}dY>Zupr(gjsV=PUY23v&;Hxo;M-KfN^ z;UF)$K~1PM@kH%oNG)X!dOb@Efj;H?0}2GuLNcYp({vGo2#OYX;CEspn~uQteSct6 zVwCFGcD^&1+BvEv7cxVaE0-Et(-{p@9T3(l@nset(1R*6wxv9yQ+ScK1q)(FQMp9d zn&~drtX9HF+ZYaMuRRToustqIagksM??ebZI#|EABWE9Ix(FkP;&l^)R#PWJoqO%=E zFtXEOKd)BP>O=`jQ$!g@St)3Ojk5c0ZEVw}h;L-nGMCJ-S|kgZxCm%}GTj+9)iFJ+ zxh>tVa<(%jjZ#}mH-Z?&pst!%YE8T(`^0EmLiI6LxBFn@2wmzm3cjmXvpHQIQvH6; z)5fmmPlZ~uVf0F>E#zq;L`YSvhkypfC841gI{@bfPJ_YtjDz8sMyj2O(HxV`hP27% zVxit7Cw#;h5j(^Krqox zFo_f+StEz>{s7on9_l%Tj5t)RJXh?eiEhG!Ll{NnlJlJFP@UMIbYCoVCYPLSGnLfUG4vL<)S|r>aVn%17vEROS*KKNyYaa;F@O8=~h-sD8Jl zvYYuSou_f?cL+#=B>S=I%r(&iN^q{4O~stwsF0FSY!b%p1_g`X>;;EPQN$hPnB zl;ZRzGTe?#85s?l?TOF>kw{U_k(Ft()KRrjQQA7L)T`Y>i?%0RolRkW45$S+YXP59 zml_p>7+Mn!@hL=!dK}kxkd9Sr1ELjl!1u!v4-6V#=j)2y#qwN<>`65Vv@FA(U zvWYQmq5pd}2khVv@r4K0Ee=t6ufF3#6;w2$?A(xKJ-&0tGRSf8b{)k)P(FL0jmR{Lm6$A?N-Gs zf{mIHkO)$0`I%C&E}P{t+$>-Pvdt!e@s{tyfnl&f&;`p(jev^B zlO4y5KSA=n#M&uVP2kid>;z)UHPO1} zL_Pc)*AQW`*p(b#!@EN`1=g=;>wr-KdyLrt7w3suBd-leUCGG}CelhQ^7HX5c-tpGW~0Ye0?EmYRLl&g0R@7U1R(uOpiNn!CNMXZ$4%f=03M}~P;wRw zo{!aZ)3AyB6oglGyk9IwjSM}^K(SX)oj5z1g18`DE<3rX89Ak4FQpX+GEK za60hIU@-AGoHjkBRS6K1t$ASyOhg=jT@B$$lc||jtJ3W-1WJs$J*%jN6RlM?ZQDT8 z0$_&7f}HoegH+Mb8{-;qbjfnBW`(6eq*!&>l&y-K!-Jp;S+Ynvb|(P>7 zejBAKIA+4*jyTF%axu`H78*M^JIGfG#gc_IFawt-#RhO6du0WoZCbYD+OVF|!?NV} zyO2;IqhiF@fIrY?COj9hX5hNjCOE7hMi1sQZsyY<&M@@`T1ChV!7c)>!Ea)8en57r zPIEHQX+XNOOf6#SP{8*Q#h%0Isy4S|^EQ%C(-yQez?Y5pS!;%}~ zd8b?e%uSWULki(F4WuMZwF-}?oMx^|xoM`%m3-X}6nW68k3|@m|B5&sbcC2LWF;gh zD81Ym@B4BcxSs{hB@1AfzDJE4H6d&oiV5gyy{?MMf*_r~&FLYEg8V?gGXl&4Y2hSL zM3q(*QpROK^1$E#lX}D$rUNd}Q3+{qXbOU}%@NF3#2TGdn?|h2W86`!hLy@z2@nu6 zOmOmXIt)sEkZabq&u2*?O3 z@q}bpxjZ`Ipt4g3keSR;dVV-HCKMbdsBOyjt0SS26*M(P+kS}n3~uL}OeZRHX%7qh ztd&y)G>@7V2-WtSjL}cDVF!ZpRRtuFxNJ#m@qNX1b-o7SteUKFSzt=FG1TQLo-bth 
zexqlZPOr!59EJ2-NEsZR3V`MPn(nlxNI(N7heJWHU8POqif*}F)iT6!&I7WJ(S20K zYBu1P0wi56qr*1bFQ&WDa0()HL@y|1$5lK;$DG`7TX-2Z`VnX)PthRN(ko3rq)4IOh6iGch}(TzM%;c()X8$b<)Vhg;~0}l zM~Z9pnPH)8%Ph}lpgNNWx~JoKO~#vOHeRjdYeZSfqvfrMo_w&?;zQJA0(vZ71#A$-xr|-erg(u=BrsgLdLF`g3`zus9GEZ%=QxmhqG0KOO+|Rxf{8{LOT9Rrc89Lu4wbQ? z(-Vm4NPsrz>HR5NHOnJ5)0q~mLUU9F>46H=8CFv`$c0cDbTT1qwnM?~UIf%^P_{&{ zXVwre>iHJ0o0$p|rH7aVHnPT~>cGaM4aw1B}))v9=Hz~g7=4Y&YU=6 zwJl&WSq!9%sScTwzQ})nbD9g`RATWzy}DgcaLOmf;IStAU3=!m+3enh&u|s;v(PAb%ig zG9dH{%9u%1GmRS2tOn)Us6-e5uWCYVq_6@m47!e1hDwQMqJhWCF<7lk)RP!lrfij! zVGi!GMF`PTDoBDQ#)iWUr2&XSrg2lqBxtyj%?6S&Ve|DAU+UM?g3s_|t=w&b9wm?o zV`lqpQ2-%;qUKsoMycsVgG{s)S?Sk=BJlCa-k{~-S*kus@;ESan(bnBu~p0rb0gWW zhlXl_n0d$d{BhmRpk2L_8LJkP#l)s(Sai!TPqAK(9G7DZxSHqyh`cy(5<1dMrO4c% zRtrF6I*)^wJW5@XAB7{<4Shn8eLO~Tc;diy8@9=T4DwLOHbrUT3hjsh$-|rugKjXz zOoP}j$R3C#q-YjFDqAW?d7PJPVsT0Y8WjWBEr0=d!}2lERgjn!)W8G`d=k&W@^l-| z)>Y1sGj7cUNiJrw??ZHn&WMNn8|!{vRy$FRmlg_WFa8m$VfI8$zXxW zo}Vif#zMl*)CQcXf*dYNta-_n6s2zR>>Qt$Nw4biBarZ32}e;|4+}Aw;znXGp7?Q6 zb)rtvQAJSEL3uJNMcEqJ(`7Q(jIl5srEC}-@MVzaQ=2$icEA%+JuH}2n;A=3Y+%L+ z1?C9LP_Ek*t0++qFb1Ert9^~g54tpvXal^nQE7u96PE}6|D@)?X~5lKz#;MS9JrV! 
zW1!$hmj;P;CIF(?7KLbXYZV`YV%(0YpwN}*D!IU+-g*8rNwx=5-jRI@bvJ)YIk30!LqqGmPc zq0UR)-CpwbfSdOR7;r%s#{1-O34m6CIe47%C6~cJU3Q`-AWw- z)tXT;M)H6G1+PL7R0dbGE=F}V%(2pPRkr$*M2v^R~10=|SH*Y4e3SOUSHtZl1^^mCN>w+J&1y69#s^VL&sAmDh5BUVWbf*0x$#sOhm@6sid@9Hx1fC2!@Z2;PjQis% zY=QIF1AM^*<#Ql2Chf^uds4Omu)yI&WQOGg4S?*UJSqo**i{j7kgtMQeEvV9fS~TbUz)f;%@#GyRt=OuCa|e2x%uO9Ov+=C6=hO9*UcaF2oq54pKigoFHP_u_ z;U4)n7tzb-cW3|Uu4QZf{eW98JZkmr%I9}nc;O4rKKsHu{vSU6({tzj=$80OTWAX} zt-bp(OXts@GiUaHz7E`K<69S>Vpg8~Pr<)`o>{j1!G&|zTys|8z`6han)Bn?7=8D8 zcV_ndL3>br8TZEj=u_NyF18K-#bxe)8z+4%SP8H5!w%=J`Nxx%&VAy#Kkjwv`u}%8 z-*>g(?EDW0xpe+qe$8dekKg>egSJ^Rv+9MJ`p!=*-v8shmR_~RZ7a50wf)Saa!AeF z{0F0*|N5M8rS8r8yvXH>37;| z{dt>J>daa*quGC6f7LqQ<+lCjpAMh(P3JIv=T&=u@wW?SKARnSS#(i$%}W)7Ws z`SRjPKitya95{ag_P6a1+~}9*Tsuqb-0%R5N4@Xpa1xe7GHAjsh{6J^Tj5s zwwu{}L#F)1vwvEnZ7%rBYp<_!=GI{==uMQOq6U;OhIC-1Uy z;bV&$GvSl-!_D_yv~uCgFKuN%M9Eg=avL?^I zJ9quZj`~n~wSZlD;%MDtA2{pk_;_X6k#PA3=VLyyG5yo!@s3y*Ka+mw1#V{H)i-_i z_;C+!Rh4t4nYZ89_T6Rm(_5|5XTF=uoVoAuYrURYbmrF^!o-i?n0M^CXIu>(|7l@n zxWQGI9G5^u%{kw?_3JNxbmCj#j4|_6 z>gtsX?_GG^H>&N{E>B!|I0Z~A9~t33(vmi z%%{8dvf%pDkGj4tGh5HT`0v#(|FKqT)vxwKv^!3b|D!YJuKAPW4#bxI?1bUZXVV>V zPU+^i=3R02^{1cH`t$jvTlP8l!7n$wYX0JtH*FB_v~u4MYn2@quUr^?0}`(9=bV7fdPM!^ftTo~etR!}=abLAwZ=6Up7PuqA`|~dP>2HipuDIn?l#v_Y?dwnUp@As$84}N6fnfu;T#0dDlvtHX^*TdeL zIrynJF1_Z2h3750>E^QB{uxWpOx>{keZRV4!~2#z@#;e;bNRtL zLJNPrQ)lI;XTA346V4CA(*1wk-?;tr6F;2SLyRLY`|Vj(dVyzIF8rp)&2gD-7$!W{d~M+ar|k51V_egpy^~pcYh=X@S17@`h17Z2$*-(+(_uF(Kg?Tn(c9+- z0uREOGbnylIN{{`FcmoqYs~vj}wB+(#RvvNu3Gr4>zWwff^Dpnc`gQP^ zvtE7&JCk02*FRmN-M;&}+rL&i@8TD&&bc#!X@jy zwvX}3!UFPF?Wl8p6W{j!)^C<8@cN5?Jz`fsCmgiyrzeN-Ig1qhA*8hLMkjsw&It#+ zc82!4oytFLY{S2L;8~ySx@`O6_WCQAZ63C-K4#Ht zTm08Pf4*Su0DExLuTI(ivcu1{wB6U;Y1Q1>@?qzFwEHv1MXmD=Ivcv~|B+c|!`YYY zvhp`u|0=-$t69J>R$Vw}&Vun3KUpdK(DdH30RSpJ@+lDL`@i62?wZe^xO6TveUONE z{^7Dd4Dd@v#TT#fL-g?fN4JL^xBS8OSx|cMCfon&fRD*HDwn1nT;uyuF57U{l25{C z_TF{F`{&K(cX|DhU+g-)ZypA}8qRtC=X>w4`X7(oe#V!tu6q8GeP93m2K)W_82>GZ 
zKZ$tb#+?p4=+Cd6_6fDk#%q4F^A_9ezF;d@d+L}4%bwbA)4~>x)?a|G|LCLhU#|Xm zi%)J%Z+|Ag=BxuwJZ>&?-X888vDv@jX!o!e=d`?;{`Jr2*SO*3_jY@9i@z?p|Epa# zym!yp-`#r1am(*}|C?Kt*D?Bq?AFVc@4eGTcS+7>g>w5nz(Xzti0RRM{ANXM;zdI4Qg4J=kEwYtLQsw_|qv__m{sXKvo;yFbL6 zv&VmW&+R)8o0lB&99sS7o&TJ-*#qZn`R(4>`=4y6jm_S@`~IWu+4adcUitlN2fTLn zH;-LPyl~~MubsB=#6N6V+vtlO%Xi&#UiiZ8(WV>z?Gy6u=^ihCd4*4Gy6mV^?_KwT z=|A^b)>`rCeS6(`QQ_HFuf3RgTQ~zIygR?za`8s_qd)#=d=L~pea@}%Kej1vw)@gO z>_42=|NNzG7tj4F`=j;a=dbR!nFtAuibrk z_x)G@@+9n{%ZdjMx&51MKice?8*W6JFd6?>dp3l|1Y~7zw(u@Iqy1Wmvaxj=J}uQC0MJ@-TlzRK76XPn{?BG z)y~6reFbLr$1nf-eq(&Z2S*)y?%mH`dP3&%56-yj_;moTe&w$Hx(98z)`Cx~ z2mNgIBYQt^_To+N+vxo(>32{1^tse+OYRTPJrdENy+7Ud=oQn&{E|a&Jz~EHpFV1b zCG(#9>EZAH;m6$%fBQH0$Lojl(HjrA>E=Ve`*x#; zEPL$Dz3zXpo!)Va8xP#uDxAI!bI0vF%~^99fcr}iJNbok7VQS#xp?J8>xZY`_1Xu2 zx$@Kp_s;abgC9Qhwk^{4e{%9VdoSLWx^wlqPm-VUOI}*?)>ht@*X*(Wtw*1E0RE>P zE_mePT_57-fO=c5|Lb*scGxRNeDsO*lUu*o_M#(pyz$B-|9Hw;;kC~lcEJJHZL|98 zm$rv~>AW|WS@Oomcdq{Ku<4{v})NW-@aeuzS(Z0{lDC!w)ZD@e|hpb&~UWm(>aT)#zlLcc=8EPzkFB!^ZQR( z{@$XOzk7MnvZwZ6@adMCb>)j!e>v~8d8@bHbnPke_FoS@?Y!vfb>4aXi0|Hg@TIpt z{t(}M$-I*<*?*JbHvWop-dzVY`*cT7Sf^N{ul?r8&9}a2$)*>|PZzHG%Kmb@eYZSg zuRqAA9{kYdS89m$(x(4h`m^6R*FV^veekMp@0I5KdOz%)y&k>u+wXQg-f8whKi$A!f4|jv zFHar;^S170eldUf%bk2*5_&LYkcJm+a`26|pKU;UzCp$j* z#-43!fxV8!ThE+k_~*R*x8J_I$6wZO{Oz{i0_AqpLG(uE7ubg9@Q;6X#|f|eHT~vh zkFUJUfAhs-_t^iY$zO7>6)xTA$VI<^unX_I_Ul77I(tXz{-52p`JIP-{?LQ7=goN( zqp#T%$oU(eH?CTJ+wWG~^W?@`KXEzs#6JJr;Io@t^T6K5f ztUZ235yZ@;+Aam=`GB<9u75b>n5Ry^{FJ}0*#7Vzz4~QtaA|bm9^r@2K0j!k zpSfwDJ$}B&HlH1J_`6GiMoMqA`!!_s>(>`=Wc+TE;~xKAd%yl;7aWp3AbglR<@|m3 zdH<8^f7keyI^v|$j--Bdfb!7KezfhD)#bOJy7qzlZoR{z3$kx`zcSujyaK<@v7S}t z9^beis(icew-@dT-i&+am}aoYCHt)gjSIC;14c)$728*cyPCx?7uy|meJ>+iem zNw3^`a~-J3-}aWDM(i-VRisycb=0OnTNoECTkmXp>8>~QSKV5;(0%9O=%%k;E?)h@ z?dwOrdhgOb7r(a2wU2%2ZGPL4D1U||?|NP)dgOxZx5^t^?0Lv{+aLVmHCMdz`mE&# ztiJ7-&mI}s>eFqX z`Rvo9es=A1Yp*!{?W?R;!b`ZtfB(+>>(yKE<}*K%pANq`ZqJXXk2hmio|RkkFGt<7 z&!ZRoe&+H8@BCT2)mxC>a+md2fBW@{B^`XlWdC>f`R0YwcVoXka?!b&tq%C+qaSZ` 
z^FK~KJ59azgnj2#2Y>ney2m!(`Ry^ce!OJ$$-g*kt%sgw-+${X`y6&j?<04U{)LNP zG}DhCv!lOk`E48SaK+cJ-81|BPtJY*nTuY2?-XuL?M>(B#oD8XExzW9(+_-VF7w!e z^#1ziAN_id?^1`JxQ$zQ_x9NzUVr<&(3OX-^OHN>HSRe6w9_kl-Sfqcmv37B>GG$q z;qTLqd_EG4r~VInU;PzV7i`%$gy6vf!QI_m8w(D>3Bd_&jXMN)cZU!hg1aTS6Wrb1 zVQ#*8Z{9bvM*e~M1=eD*Zr@X@?!4PQ3I9cndkY8ejhZJrHP`+$cojWDQQhnIzzT={iv0Kl^xDLD!)h zw`i+LF;RQALrj#i&d-;;J*)TFspH>@p^`Pam6#lKDSTMl4nH%LuYZ_zkY&6wq+e`_ z@67zTs#$qYQIL9>EScWVIbNVaIny^Eu0e-fAtZS6Q_E5`w#lKr5k;}(86(daSM+rS zH{-eI_X)7oda%zPtNui{l|aW29T6H@`3vcF1gY20JAN9~ALtjmL-|YF#Wzfv6tUZD zv8S;w=W2Uwh8z`o9Z@pkEu|foQY6g}oTP$YCt^#s+Xr=0X=>3qU(Yo?X9pUUv%V>$ zUHN=!^5#!-CHl5EIKn*=6j~ zXDcFztl!`Nu-y)}{#4`64y2WQ8w@(#>R5-qxsogRIALXp`c!(4BFI#9NHO0HfnUJk z&RTMOv2xRA4=1Z-7N(+UYsVP?uE~NeFm}4Cf=|`H5iFc|g}iGwyRGc)K@&80`xVh7 zcz&4L2r($bJ{vvT6wX#Uew3~5`C}BUnKI<4xZUf4Qnp#{&{xidU$^UpF$wOw?=5AA zJf(8m+-pFfGtXy`mRGPjl-ktZS@RG-uHzchDxt30>V1pYUyApcI_9xjvQ(#yR1`iS z;#Tcf%tl*z={<}h&;2yM8OuVr|0RJ0oip%i@OX!x=dsH?0iC~#ZqJIF|6CT zWaU*nrZ3Wtd0OIX9=Tom005VbT!eVPkYujoKOS-i)I{B6NP1OYGY?>pjv|!3^sJ6+ zbAEYZPP!;ql|JS>PXt~Kw_`&UTGxn!orP3yOeVH+g=`0MzY-7W$oC;!wx6H+2>AW3 z?NFl#o&7o{Q2KE{BoOBe+?%@kaly=%vVb@`+s|Hgw<#`0iflCP(}%?xQ0io{y=61i zLe%1JZGR+;7Po5#AARS=?>e^wA2Bc%cC`Qr;++J^22yE9jmcoqrf{Z2ni_5Flb`Xq z{w6I1Q#>vLS8gtfAE;GjDffbmrm2oePM*N7;r|*XG%+IaaP=?-dLLAkzi=0F+NB8=&a~Pzi;jg#q zS;i!4=w5DiDd0C2d@B~Vg=Pq#o=6+#p3Ms!SCn{3<_Zg*<9WOrG+}alv8lxu>rA2dKNKl zn`Y}-INm~j{p9Cd8UWMLq$TsgK~*C5z9qeuepz%mjg|&KyHU?(RRcSz4R$mcib&2H zBS82dj-9fbh@+yPnsPI*a<|s@09j9^o+w=fZw+SQi+i+2qe=U~uUPgWqa{_$`!m5v zt)CKe<)2o`?D1}4IU{ugb9x)}F6#pZADbq{-iPu? 
zlaV^ZjP58Q#L4N~E)~f{LV(>$xG#8g{#wy$Ig{?JO!O{p%!3_QEg5n~a{MERp5=2t z^mxmF=ipUmU5SqRb`KGcJcm39ivvHc8{d_Z5`xpt#-86@w(D0n5y4BjnSsmCMIK| zumh{M+(?q2tBg8aljSC@jD9;O z3Gdf@v>fIbDO;FF7d&0gbJ-mOCNDthJKzhdO^qp%kJWWkhq(B`s$r?b$p^n>B}X3m z9=Uj~omg~t`A13;9Bq?@>sm|LVPNAhpqO6SDv=e0V1r*vbOgX0gcgfZMf9`3oJT5Q z(%)@`CMy`G*U8n_InsT=&l*hB>-(&b-Jph{qVLd&)wRp4uMyG-dA;p&u2q*Wy-#Qp z4ph^j-s2le<;?G1TbtN4pK!~O#pn+(Z{_g|npOU}KO#0r*igqclFji4Gpfs-M@j5J zBSSG$kXStsSd1eKu0r56ZH6GyzN_$4#1FVZdNKF)NfYOiQgESEiQ0iWN13Xm*G@k5 znDRpyfib3vqbZC6*e!^&{D)NsUh&F$YVdPNv|p;gF!v*zs_my?aiSkZg+!tx4oI}N z8Y<@=L7GR+DyaAtD(KQpHp%)?@ya&S{TN)B-rg9D)h%L?Omw>gVa>iv;9k_<O?-h1vWeSBxFEU zIPXV3_6kj&%JMx~!8G%5@<{ySm|!AufRsYlFGaJ>-V@5dq=={{BM{ie7n9z=K^Y)6 zu88rO+-TAnl(Z}SM#;-)FpiYnH;xLBFEJ3{{e747L9U3zU?oF{nDTyyQ9&SA2AK$X zN*d+h!A3W&fbaBLe~ly7Xp@)?~n`vX-ONFxV&gRAU;|4m&Cv$2h;zSdSSmzpp$Bsy`(~Nza@@ zhQzf2toz-^=`y4Yw#K&w-^{=bOwM{7pKi}p%*-~Xx56-&>vn9XVAkj*M9dv1DI}RvRic6<5_@0UJxQlUt7l?iqJQ@`Mr#Z zx1k?ycRF(DL*{ZW5_lDfR7r7YRB;D>4r{&{-8TYg08dq{(EDm)AJO0eRr*7aV$P)O zpq0Mc#Z`#GM>5BU0rj-vPt5H&c0P6(8PB5o9cHj*uhlh|#&v7M(~7NM15R%0N{QPj zt#g3|@Do5GHjkuc(F&yG=HY4Pc!C>Bi^)SFZEJZuj=#MD;1+ljp^|u=XoLsVnPB z)L#+LUzwsjU-N4^%jfH`)CXImp2l`WcVEcWGsT-b(6rmlweoc!V=p~_YOJzr{80{P zZ~~+hl|yVk%@ss9Z`LPgYqCZQq`qDtE3UGstt_cME}zh;EAPJKdo~+_+ldF2-(`FL z*-p|)6T;(wC-!2~E5zAAk}tyFI2Pga(aX7mzsSN0X&2s z{Gn@}&(yE=^S|hzGTS71 zm9L-rqKMem2Cs;H@YVD~;!Oa*rQ_#Lq2)h(^-5(G#NX?rLNm$G%%vzU;(!AE4xZ}W zrn5)p-%5$}Kl#xAqD(pF`9HFxjFnrI4<3kwQ56U*U2N1giMo<2hF&c6zuI(~qbGk1 z_&q)Mb)^)wyNdAP1YwItZ5Nm9{y((Iw?zCzS=uXr`7rVdPgm>0xK>Z-#pgS|-aq4!E zNGsvZUn9gMr81_p)3V7~^x#CrI3V7y&_c;H8H#TW6xKop8F(Fw3+Q=U5fVViD-EXF5Bl#p=F(laa5t@W)%4H zlkB#)S<}U>dg4_^Y2e-<_wEm47Wdat#%a-9uT}35fl3WkZnLwW*#HG87X(*jj=U3@ z-B2d6_tG2lF|7&RdG!#Vd5jW`HuJ?44$^~$L^E_4q~8e;4C-z4rcup(lY=_^3%5kU zWx_VftTOVXD*2Lgjpuy4@r*QVqy?ymQs3fWNERbVor^*yCc=iz%(l!$qxq*uEhN$H z_(@It*JHi-bEFEnHxT#Ft&w9%tu0MRy+|JE@%|JhHrEfxgYh?;b~`^i52i_CDR_+( z+)HpF(f&3Bco2~JT;&n4;t zkjH>@EbWvxpS|pdJ7k>xucCIS{a+PYQM-6QN2GWKrP4sO5YgP=wE_A`Bt7O97HSBw 
zkuNwwg2AU<;M@p|A3k4e{l&B05Z|)y4)i!Tta48`5|}{i^G5S)iOo{(bwWrwD&Y$e zGBsKgB`^&Wn9#HZQH8%$)3Ep>s>N6Q_At~eyd&RXkn_8;D3p-rhP+ANNEWYG259eO z?8*q=xigGb(CIUiLKa>&=g(Kh?u~1Q2>a)+5XA)An=rDfFGa)35-CXW2ESh;+(U}b z$1++>5?<67*u+8ZMX0Ye8ik$ypPkIwpMVWa-Y_ksj`Cht2m2Xie`jSlKlk(2X9(^ncG zIClycTF1)gvr<6Nu0FS%C9)>Sx2b144<5WwGlWO_4;erjajr?DGpIGXQfjoZI3qC?iyvX`;F;4h%_i9 zn?#{gBEIApz( z;%$$kLPbKoz6CwKz5HN4S4Rce3pw#Pt|`7wAZFeax?fs$T@I22>Q|sM;zn5j)ACbg ztLv+b*KI)8V7d3!WA?l8udWjbxnUwKQi)AsO~9bQ<>rg0{Ay!R^w{OX?@)G}lE?6o zpwJpO6G~j=OTA^!o&HE4wV$9k6+OsW7;IvoChiJquTl_>a~Zq|=BLtLf3lXK7`CD$ z4Y8z!^5!!B4UeBzKVCAA+CAEesvVF?w*|KRN4lFN{CgeFe%&5`K2*XKNJbfd)4~wS zYy#_HgX#O}vETbA@!Ob!JyUiQ?mxcX-x`=~XSF~$>1J(lU7z=6mmg=;{egCirfWm| z+5_FnKgIKKo~&#O4& z!P4*^p3Ol(uyuemm>k0lT25@CO@VycqwK3dh8Kp4f|N`}QS?FnBt~qN_gz=Zpa=Fa zCgY{nA)L|W&`F=Y1@f3)V_Fyaezx;)W?Hq zu2Zr?M21E z>f%rjMa~4?@yE?`Hn)szTu-Qh1Z66E2%Rk7quU`ecvBX=WZt$hGb`gWAI( zb!Aa$M@6d(l=V`#r+e5^m;|E(08`jIQ#Qx)#c37t5Bc%k8%KeKnMgaJ4IzroN^slc zXU>$-n7Bio|D?8>PEMe=kK0N!Ar+j|5~boIz-m^V6%yisfAv* zbuDm=Hn+F5wUYCCe?l^qKst5Ef*pMN--7t47F%pEeFD$D_bpjn|Gv&h21NmIvVkP( ztCPP$Ku3;@%>DgK@u<2P=5Sy_dH~8|dw)A8`wTA>JLxuIk^Sc0&i%PVyrqMjBEF!D zMi`u+O`*1%26|PHFMvCwrL4BRZdW4H66NS64@PRtk)ep0t;A%z*NVi3j%KXpCfb~P zs+@)ZZY({9Es0C+#o<>bIWk}Uk21vf%!qe3m!3xtJ25p5*7_XEeXDT-lVL9V7sLlP zZrY*riQd=(cYOu-hr7?Nk;JMVta^Cm;Ujz~BDD*NGkGG1u;pKW6UztBE{D)*&Ob(e z|I+_ta_~k)ds7gZan2%(HuOB8W$k9sMl)pMN?;lpjhP8TGVjIX@e)UtR*B=OZ=z~= zz;Y8(J=%8ARY|@WHi?pw!z*j0BvD4j`)f7_ReB^~e$-VhCD2A&-OL=4! 
zA$~aoAPsvLJzM-@n(@ubam^)nNYxG~aSo!$;4Vrd>r!Lb0%UV9t|z{~=$KCxNC}9^ zo)UkNYO_vCjZY)}JT3e`nf?%o7B{a_!$$31*bR;XYuph5|3GYrP)}7-Uxiz{vB*w& zyFPaqgdPQ;?|TJsa>e|QlONp0ts8znt6})*>76e}VGOK*na=iUrO$t>mC$m&*ahXNZEGjbGJE0e<1SVG=In0?})mbEGw5_$Hfz45+mNU zG)1TR*ekIY)h~syz@}dd>tis7(oXX@jA&WuOC~7;1-RMCT$w;|d$58{jpZwuTyWh1 z7kmHqzSum=Yb8}dmS!8*vHSPNH~Bf7i8$6aBA-$!lij{?lA@TtE!Cnj-8ePP z05D5KTlL;Zx!GHbb?7mpg>Nf#eM>;U!QO89s4>CA=I^6`kR6<&Ub2dlDA!RW;6*N+ z897rsP@+&iFZTT!fw6=m@N7IEko1)n0}-9F>`fa;W@=Zu(nUe-y%;@!Y#UU{1?Um~ zh>0MUGTs@TngI2%tyr`7q0RPS^WCxbHqJC{Thd43+>`3vnKBQ}(N0H7s~pSuS5#G& zDNs3LT}lm$_3TT}K!RjQ$og<;8wcrj{f~&Q?b7KH{&wXGrajd`UW+%U= zP1#`+3%KiIY^Zu9Z$94w2%dd!Q%~P{^(q zagzk63I?x01zSuEeHrai;I&z(#$9SmM-#tqA=4V)e9@pCLxTPZsC+F6y8nFd{3k@l z$rt@nSJOipTJt7i`o7Qz!$hc?|DsduWyF9}4p+x8PS>BJLieS403k9h?*|&S3$wCj zA0YdyBRlh*$iik%sE-G9g6VtOt4+5gH^h8JHgN3802`wbQVx+U7aA8Gg01xXFy2%s z+b1d3Y_Tq3B1qAGnp_^BU)3ccjI(gJx(_m zj_xh!_;Kd5%V&>KE_x;p?#&-awCu9ze_d4`EW=?Dw0JXPyq#1`Ew&oCn|fn$lb5(AU~01)Rv1Rtz0mp*JAv_B zHlrk^o5nGCz?!z#Jax=e>|r;N9(?H}6NFq@xT8MV&HxCu&QGTmmUQD|*y?C{E(_J^ z8Zo>UNxjn4EHCv+Mfnrs)6*Ht{(7Z$y#O;JEFcP)^vw+)!3D~i4kX&x=|F#rX7j`h zmMxe>>IwE^EXS$1)|iyK6V`X){U25RbNGGwON7Eal~O%0RpLaLigeYQ8V^48$`CtC z`+2%0&}Xv_>aq+6v1)6v7pdc@IDTHB>Mx%>H~@OgkEZoXwGOn`gZs6(YAhy`l#8=ZyHkt{;V?$4#JHR$SzkX8RoNRCA&QH^j|(K0e7zJw|g_*b(y6e4BKAD2~;- zHGu{xG~_&}&VdSt|NU2t><);QmJ554-0|%l=ApO|a%017KE4^#j{I$((%UhCj?mAk zE7l-$8LsnY_Gb1cn_&WO<7f7x~B zlr7Ropo?#s8uMmw76Z{3s$nIpm6V5{9pP_3_b0>|5NgG17`+E1Jgkh~hXJ@^mLF=B z+DVZiG!xIS&2ax~?QLiuLV?4a%65>ZCcGOtjEk@nj!uanvJ0NGWfQ0m~f ze~Ssf(Euucsf5OY+JDRWB!~f7PQGFh8`;0*d}MC`qhU}o?f>{Mu`VL?L;8Z10@E-e|0ibvN5ov*N<-;_>=^T;cCtoS z;7H(nRY5N@IXbSOMy$>mi7a-_E?1gt$9|6!{Q4`3X~aAJO0i`$`s-Cx-{&HxrqA1b z=)wf%9V$?FSe>?{CqPC6W>)&p(J~x%Uza#=#RZH@_0jE^q1zE@g(q=9?of@ZdoNU= zdMHoaHMjLt+#m*-oVK^1L8w27hRnX=pmyl-NhxCT?HLY`_xaw@DM$m_ z9CeMt_XZfrfGpp6kbLn#>3owv8jGYRBG%0{V#4X03KD{{V~^>Nbj2hH>A0jDXDWd0 zJhemSq5)RO znc7Uc1xw&U{emIU8n?XGSLWO#VBXs{tDxisO(W)@QAYA+ar?TKw_kAhHiNQYt9m|( 
z8wVI;OS-LKBi@k}pg`GmZ5oXQUBiKx@n5|KM8bW4zLW#X?Ebw{2~)g4M{3^|s8*yk z4A3bk1(gIZur6BEuIbMaYE{P=%pdqQFOYcKw5Zsi7(KO&yFF+ViV&RBW&{EsEQ0Xf z=D_#@vW!tE)QO2Xo?CA&3`*^9p{(2}8O0e5@6n8!1UgqNB39wf?4g|t;O)Ban5bYN zoqrhYum;>@jozEfm=kL3sOKDCdTcyBL5mcM(4WT)+=9}eKEW7%CUb`0d>pd*`iF+V zrf8%uqO#+;c`SyL+qV$v9jG&sk6RqcTK8)wu45$*bQNl+AgCe`a)pGU@*dFvx`arC zID@QU37wd0R=fgfsrTID7+)lN8(Wm*=SFA08^)CO>drW!?Fd2TXmx4RYzsPu45nGR&}lK2 zV*La2%s0_E3f>VwpQ-b{ zmUREK*B6lIzGNE3=n-=^wf%ZjI&8@6*y?)%+2p{wB4zFY$-{~U?`PvrKeE^vgw;eA zp$K8lXnpxUXDjC(`%)F8p;vJ#@PXfcOQnwWz~~I=*Db35+9*n7-d13eR`7x?9_9F! za3s*Fj?_cjOe;Ed3+VHpAagS@B+#y|!v{Jt<=)&2b)aY)H{DIX|J`)2N!GhC*gCly z(r~ZXYOb)ebIIOA$2oY{-c%JE1I!*Y4JmBl^k9yirf4t(hmC8Gv2}K)tTONBGnu+o z2^}E0v){E%zG*`cQTd`OBgfrga1apKE*#wXmaS?TE~jjr)iC`C?N#A|XRMQ*Rm}1P zx5H#`c=2}RUU`C~FhK{mM|*IDt7F}qZh9!=5+CZ!gJd*;ahJ-O+P=uum&Y+Gfh~)Y z%JXOYxq_GRo~CwL2)@I05LO4mDAIw&fl-{;OLV&XdjfO&Ez^iq(hiTfs#{P7H{DiI zz^iFC&%oZg+^-Lc%fn6GFAf5$c&n%0`FcPzyz*Y}jG$OL%vx!+%C5G*=^>{Y%G%76 z{;A@R!HZW|HP?mhQ@h0k#rlrxfGf;(fv4fu{%h`E-k7_+Tpkk~iY5%ofxg^gvdJvC zusCrN&!U%F%@;B^;BO@qdqR{*%OA_w=M!6EW%C*v99nLq!`o%O!LaC78L#c&n z+UzE@&qJzt9PG_E{2HhV)DXV{6?*$v^9BB_JW!87cLxOqoyHi~8pE7%gW?2m09-1; zP=`9fC)I$md@LS;mR%5AAllJ04s{damBVJzF%q;{?4j5a2=3eu+qjz#?)Ps@)w@yq zw(@YDFp-=Wd_h54co_30Q8q=EFAW&i%5bGHSrUkqpA`FEu4W2-RgNTGdnB_Y)7c2x zzyHlUB6bMB8?ArzjaFs{7PRD>ii@TwHuz#!!Lk(88&)7U#F~491`YX>g5~}tHYR9# zuvOOl#Sv=rcA#lM0e^QNK0w!jaKV4<{(A?q!bu|rndwth33Fp;+gp-M1gRbS7$>V_ z!QF~kgsa{&Zf`7@NJ$+mm9f)kiIfzF9{G#z@wYfky+;95)YIgt5v!Qw1v}pMs)vV< zR81Q@NszOS{nT-afUOaBPx82qJ|YkpLPI^OXhU%)Y*WIF|3~G7LO0@i>+YUWnQCcx zoL^+J!kM<3#B>SPJ$TtX!@QFOgDRE+Y3HHmzB`hFnIPtwFnlbb<=LU3_9&sW@BFyqr9g&0b&OrAcMLM=5 zKs<(si%!Sy?W=1n!_x@92q2@Ka!%9Z)((Z-gonTXjl{@gD%Si4ghYtT)$DKXSKyG} zZTP4Qs;^I!@aOkBxhNF@gRi^$gTiFBb++zY>8zqcWm7d^wBwv6m=x)dv=g0MNwK*W zNxao^)M$Xvb}rKXAQ@gaZ!&fRj-!V$d`j%+*`UfN{xBOjy_jE@GwxtZC9;$g^T+P< zwpbVM??{AQN?Z)>r@Hz?#}FKMI;X){JgfOB=-!yHLn0I_u|eT1ZlMgv_J!iU zxUFbsMEJNiuff$hm$zZGlHqW>C4A_AmjV)%-%Gxe2j~Mu7!3H(ZBN<}2axi;I1&V6H*uIZ;Iw 
z-ZOZ(hsC#%V~1it8krzC6@FP7kTzNhz)6Ze=Js*tvgDhmX0t`nNOTgAZjVNXWfBwP z5?PaO`v{wQUe&=W+dEcs;9{y^%4kzHc3L7MW>2sd?DR<0aVejxT0od?n)T^Kt*h5o z>o7?z3Xs3Gm}@J$k$elM4=qTIafsa|t9TS=8wvW?v(qrUZ2{g#5;&@r6>f4$J3GpLh<3 ztat%cpY-yijMJsNGu{K+_iV$GI#&1GFCWE5s;400aig}`Uisc@p2*XAG9SdXswx?D z19)CDdE~JfFLnFZ$zTC3FeYVx@Ptaw2VzDB&nVa^!E4yRITO64FAk z4acSW$+qrH3hZ}ZuJ5h(x`ri?rn_@3ugN$YoD(&f#!}zXg1vr9)!Q0*5THUSabDq$bbp4}Gfwj0F(}g)@o#9`H@lR_8e4-|e?jnk+UvrzE1n z36L;MB;*7cy_P@|Z=}U!t(sR@>{NRdV306zQH|u~=#cfM>TLGL(IP7w@Ko4u7Q}YT zy|g{Ro;-6`T}a22YfDCuZ`@**&ia%bulvy3JQmM+i~2q~red6A|Nn0HY{&aQ|$NHH?DSfp$P-1IDiqqsKClEu)no zEB{I%{PgkzU8Vo!;h|k-MG%RW(Ivv7z$D6;T>FhP|8DM29(PO?>-sHP+XDbU$ka6E z8m>3=p^z}TGPnT$yq+M;7{!p#CZsnSN%C5(cXh=!?khIAQ)buh=$OCTz|}wAJ#-xz znBW-ska#`OJyai)SmI&x&fG-hVB*x?sf-|p#yE2x)GwA;k7+byyOA6~ zh^^btq;vgJCvbqZ^%T=oa%qT;3$1qRxD+y|U&{X|woA{w#dWj|OgN!6cs5cJ zW(=gjif|(l#dzdk~r1c>3&Fu?AhUOMuxUd0|2}t>4Z;E3=wa${AQc_)& zV~>QJ6A+n>W~fx}-x#-SRhUkjt1Y5wUJ_ zKm5dymJR3afHN&wvz;c7xZTRA76V&w-duadgT~q9m-aBJpz_L62vBYHaXP5#<%lmN z!J+I(_#LDbFivP#X4HtKH@bV`-|A!?QERLZyllXGT5Cmq4(knrrwf`BR!Ar`9IbY! 
z@#eK3fSFjwtP{JReJBWDl4Mw#?zfx4R?H}586Hm`Ly_7rAC$11Q8%AtJ~oN&)E;56 zZfXCz#PO8Ieg6@xFNlm(y6g64<71nYKuI>w7`{WyCt1$qkG%UDKgid@V}RQ%8Q#1W zBO{Rr8^T98YPL>)8ZOPa?{aVjj&w2inL9=*qNMl%; zYD#K`3waV{xQXz{(Ujt#AN!|Tk)kW(@rm1qTE>iZ zC}6eYrKPD7K%LVLj)MCom+XE&5^JHMvgz)It>9@b(=L z()u#e;S5$AR(Pxg`$yh9iq1K6=n-v1#;EjPQ7OATWlHK)JyRAuMG5re4_^k&D$dH# zmqitGv@GviOc79zN*`Dqh>t42Cv0RjT<8AXp4QVQ(4lAzm^$+^T_PzkHvN+$kxJY7 z#_5xW46KFI`mK1}chxXug!p&y5TIn^x^KJSr%2u97Hb=8ZY;X+*wqRadDE7vtH^_j z<6TBE^|mnfALtmmJS{<1xM|R+l){+kRD9X9v32W&R;91)2?6;_j6UfbkxUFq`}ezJ zHP6#jy9@Xf4+VMUsZX<;hIGX0#*C+I>SI|3goQHUF(tKc7&xQzZ=N`5)6fYwr}~`r zl*cJ`nD?0&$e{1Z7A=08rKfEm>vRLV3Nqq8y!+`Lh72dp3W*8T2v&fJfzR%!qxxa|K){>H`%QwRUm{D@zD{hR;hNp|X!dWDa%ciM<|;$p4QS_5CL+jyS@2w$DeYmwzaTNcbVzgakoi-BX5})C#&nsT zDQnBY7yz%E{C8mb%74vBlKb8AgI`wB)r_3By*cBTr}#z-F=jdLdh|iJ?J*v=JO6dy zi&oiIxNuHk%0f_e6o5fq04eJKdz@PYy(${eK|kWjqWqgzMug@9 zsA)?W0XOx3k;TAJxe&lNAAh0Y{I6Sb{$5J+KVDjsjHE%GmpaMt04+BC*9b2xS#6=A zw{2{w$XXlDGbWFV#_C&07W`A*4AJ1Fzppsrl7Isp4l{}aqF#RL*XZT0*EdBk=uNyN zVzKEkBq2GW=)V|Y+nk&e^9%^lqnF*@55x8Ot;=Hhi{v4G+mLxj_K5aJT|nQR#G#fs zNx}-fiSkV>Ju#x+k}mHkx{K_U9Aeju`eJ6W82AHv9LG8hbRLAzcd=pv7|QmIY2_ew zKt?-p(LwQBI&z_%TuFBBf(o6_EUuOS$Kzex4L~)4vDU>!-bKUwSyUJJ=NageXTF5~ z8scYdK{?446)LlE2)$oivzilQqM&04J9UWg*jSnfec_-C7hy!+gRWs9+)I^e2rRIj z`Jh3V6sv>rQxF%-9^%@eo$EB*arlP42%-yiG`b2xAVnaD0I=j4bWdB-y~47x1mJDX z|6~^1P!{Av3Bcxj#}$(m;}Qma?U4lQz~CYH_Ogy&pn~GAj6rmuyKa)=AE%N5#vMo` zxV`~?T$q%np}@mq{vQvIl?P`-jU#mqfm}d%Aj5#qTF0n4?|mIXe4tOTzJXU-ARlO& zw}!aB??Iy>pFV1WSac0peA}TgRc!;ORb6yIgx{2q?N}s9!l+ZsA4-P)!?eJKL?eID5zL+3) zn8qQJN@Bc%n2O_EQ2+am5EZp@P}OP!jNvb+#d=x!XVQXYPy+OsCk+A?l=EK`g8v_@ zw4o}9NMG9XBb6z30cz9cKO}A@P@eE>Ha(a!+%AkgMt7LL%b*1q2)2l5{>Ql9I?$hw zCCKGfZ$%B8a6qirAu(JCc=klx!kK`; zj^D>rBI|HhbPbno+Fzg2L-rHrl*&!XDP;1-R1n5D5k31(1f1DP6&Ql+XhGU=#878~ zzVh&-UU;3mAh5)}t2Tu+9@FC$$dbUJs6{v%BYt1*%IUc_QYqmOgadu&W4hIX?Sn{* zCTM@eDLyfe_snAgs_y#8E`FoW-ZeF<&HQuupZUoa;P2N_@EOO^BW*$W^kbTE*u z>S>w_zRWk(s$>c_uS<=73+iF?P&1a}nBhC4Af$sUIH#z-k4Sz?MhLC1Qtv070VP2^ 
z&7B$_hwU;pKATBjO7vqsWdDu%Ow>CvmoOkulmRscdO-45tUv z6+YX?*ja@`&Ab8MSpCm?mvn zmshn&;yr!FAd8RkSIr1&Vm_85+t&d$J~k3(fE7<5vfGU^E1JgpWe& z>2Wv13w6%7TgR#$enWe5*z_}wd|hls&0Mkam0?d_tH-THCPf(yL@r&5`tXcHX)qeu zUD@Vmp!gDTk!Y18R<8|VX^t$tkQX}`yXEqDkYLc`K;U%{49Z;O&O0lk zRIb*xF7Qk^U~-WK{FZ<1nR_zkAbEoLy079moz?px`FnK}J9WNznfo8-ErAoo8u>|=M{0L#BR{cu=(q)TQtkO)n7_}m*f z56-4VezW!Q?-l~OIfqTD{B?el@!UlbftV-Khz~;Ox2xH2YjJJ+j~gyqKYxAKf-J>? zR4Tl>7~`$Sy0hs`qgEFfHk8h1|E^{if|)PGS!@*LJs%`uGl8c4gb}RP;rY5unm}_` za8XgHkc}kB|FhRs8rA4|PI#?aEqdVXtjlhx4lu6>y_H*lEJa8Cunv@Dh%!rTpfx1; zBDXhYNK!CbHt~xSD5%T#?23^u=mPEm!LNKMr!%1UwU?YWS^icMjJBgwQCkZyCSzE}J zjv_BoxUJrxX0}oh=p%20I|1K-Q*vP*L4qE$asPNqPZP(?{=VTL2UYQoPQdu`8qh`T z61*;Tq76iqe6T-78z6ZdlKcW|L;{O!4BqY!u;RASm2Ss_zhCQA22p9BO0x)@G2gp$ViAJ~!9*?W=ngo%$0X4M+v#1d!_g8*w}KUv^3GBl9uI(maiF}q?G|6bg2OQ1Qh5mc)_<#ZdBaTPQuOdv-& zxg|9(|NapJFR80Lql=t+RMbyQp7sMjC9FgEhqD&~3-BCVoKmo@^?W0|&&xT#KD{!0 z&%jt%sXRLD{D}az*n6vNldEV=n=Lz${=c6W$ni(~{<&eO(dk@3rC0{nF(gcQcP^PYirkWw z8Rx$*Pfq#1zUP^3tt&WKyDY3JV|`d*|2 z@40NlzjH!*j(oN|+bfk{u6L&e=YJD#p!}E}tP&NpdziyIa6$DMQWPdsor_GWRKbD? 
zR*3MhzNtgHd#Y6=qsolb>)T`|#csOiuYL0PxOGGP@uKaX+aPa`RtuL9y-4pF@Q(lZ zR0%|j08?uK5%lOFTIBqmI7U}-FVoxdP3b2yWOwsU<^)&%D>3xap6uO|`G;Qcg0{vX0Z*%wF^1|7j{ z*sJ{>A3WB7F52`gnjq7w2wDtFH@aWja@fohC6>*pT6QyU?juY#NB1dB5p;&_Q^l6bG%lh#W6OX^30-w#S2f<6IKQ5@2 z6-cs2kLPGC9aNQSKa?wZWm~nB^>hZ7BByYF)ca?yjsec8fmr%$#I~B^A?KGudxHeE zjFS+_p&0BBWT1k1xC5k3uujl8lN7E?eop)Bea>4AL%3c1qcXsEYg;T9In!u-7huW| z4#x=a*j#(5JiqV0-kZpUyE$42gX$56`#o`ToF2BZj9aHm}EIv5jm`|P$*{Cqh!n-&@$lt=>^4L%}m8>P>hzM6U`Mlh`__q+b0D2i$bN zL1ivqX6uj!7Y?=Bbz5-fvwmSaogLE=(c?@8iZM#q%u3KZgx{Y=y6$SQjl8ET8=N&& zZY^Hc&p!Do!sIc*JZl>KkAJ>jfdUv*4#T}4?RYU8^YzwPX*1=4O+yXj(iN>hRIvZV z;TlX3H#D*e*+E} zGL{O`Ok8Yp&n&V$niBpHpgx3S)Z5D&7Bjg#M*j{_6YM7&81|%@r106+KRiYr&e>m$ zLv4BUWrr!FxKx)x{3XGXHnSAK8um{klZI3EP331mjDMXa<_0Ef9Nd^kOXU1%tu+& zCjGs=s6_2VJ;A3I?B7;5Zi6mOF#8=^MF81UvF~UUxzf=ylnR+SIFF zrfH^jcR0ZG)VV*~=Ot=43j%wodDJ^ti0ld6=IFxbU@lCjR*sGKRuP%j$C!|=yu9hZ zHTRZrQAXd|u#^lUjUXbz&>^98!%#CIAuTwBNDL@QcMmBsw9@DR($Xy=AtBw3NOuff z&kg_cyyrdV%lq}^1M~aAo_nvo_F8*g>)Q8TJEJQX#8Fg1*z_qXx?SzJSsuMok#d_h zJC>F%aL9P2H(hbYnq{QZdz9a2G31jFLO^Kyl!Mu|sy=8ubh^;nhOzq7)odNs>b7}1 zpHRot97W3O=Hh5?leBb}{;LUA^cQ3D6zo0U>H=fNU{uuETOlzg+Ninf6GF=$#{Tgn zY{%il(?JC<_tIQMyx-NHd2d8rK9qKsqT+rPjWcBlUZJ?Am$M?W_Ea}y2$r(5o-93Q zNauY`(L<}-{IPFEG0yeebZ~Y1n|{@|zTQ_>W3zW=4yw!{T@vWA5_pTDIbp%c%S6|8 zfwu9#IQ=&LR_MO?2ZCHbf<1TYjW3TBKX-8+f3uuqGk|`caymJAwO(;0U?5aY2a87; z)$UV=Q+I^LM@!{^)+YFKDQhLXK4f?#;W|za2i2_1o;Q#WDiky+0zhj-_8ZJB7&tN` zaRCaB`W#G6$l72^p#2U&3`s5uwv583vtC{(*|XVc+kyKo*YC(QUr`s95WB1(T1f{j zjeYPBZ4%^;eg5?qsF(>9YJp`T>+U^>E4>W{y{H!trN0z7#|Q2P0eNZzKQ@ zeo=Ft?it9(L|z<_m*I1|R%0!#vtF^D87m!xJe!{2hh{wEk7aG_&d|g5XfIyaOG2Fu ziN>TzvtJ%Zbo7YpKEpKDN-pz{Uk+g>IfJoB@ve1iujL;Ni;rIFW&NxUiDp`HOh_=t zw3ud?4vjXB*LVf6KqkM=iFj4MW2!1idy~DEDM@=DV(?vGa=7>^hjIC-8LOASt@rQt zSAmVrqmX)8TANQ?_w@IWwR2=64Jo0j@>tZ%z2Vcpp8DNMAt{@-3j2dKKXu1UNmA+W z(HW($sKdIV-)gtb6{87|7UF_0Gy5KHo-T)m)6LYG(OYdv{O|=oN2^3Oo=43`OP8=W zIIMI9B-m7OBsJy+IkncjdMN1=N3@}M8p9k`6UMjw3$9*Z~4Rfwy@)uGp? 
zE5_(vU2MJLi=qX^5}=BZv07?#=4?@ajyX|(LX+nX81G?GPHtc7u`+aSM+A4J*Z5bw zz2xOx;ZGz6fG7lrWeM|P4oMT4XlJewGdgt0s2m6?*EIv zD`rF62ya4SG$X&iy8=zeFoy*=oo!~1mw9KDxm;Zy3to607#|+j$IgbG8W%SP0XFXd zRCn{DNShp%Enu}1Wk@tk`7U$+rkK&(y8tism7jj;+&Vbk(y5_efBUqf2WE*`ug35J z4dEq`3UG$A;bwa10vZD9ZOTA_GI=--AOoW2W2yk7x;o_>a0W1!>mCq;CU@S~br<5x zUP64tY6kKMnoEoBl`d5~p0oJNEV%JE=^@U^t3cC}!(mZ#nAfiVwVJXx{3)qi2GVJ< z=GE%BH!kt^r$A%1bj79Qt$`xBmYPqRc4lS$#X~4$zT``ZEXb&K3A~NzKRA$xqyuav zh&yKp@>pBvGzY2_NB~H@fMCs7@2b5qnyPh^14p}874DWV1n<@SrdR&B&h_0x5(_(b zXmW0*twy<^( zdFn`vbWv&wbFckM`80gF%RL!%F??|`p*2vf@E&vR25D$O7p08Ktp6VR>YO_(8_a+i znYbl`K6G(o*u~}oI(ppy1P;6bsDcPv1x>nQLuy6>3Ey*TfDuSWoe@HqB`p0PvL^S; zw|4S#UJjW>=RloSsU}q`ib3u_a~IsV+aTX&zO&S1iTZ4U@{_o}?1$<4)Z0QsgnUOW zt;Wity3FbND&|4v29O}b=-vbjRO+)+*Yc?orgAB3#E9tRo z@kV1hyt|z@Ad8T-`jwkBMl-}B2=C1&c5vmOIpLN;-e^zXnYM(2kta^1flKIy=f|@@ zx&Wc#ayLKTgTQXgHiF=5u*q<)G5h{_Znx)nl{;N}M*{x)dW;f)5A*6*R1ylp%M$Vn zT&e$j4n8bMdb9hjZ#Yt=Q%fRZy)vb>B?Rd-{QGiOBRnKVQppzcaCa{4zyjj2xmvwC z`$P1|x19EK*;}QRbh+I>2Y9HH*c z#B_1El98|8pc*c!y_G0^eImKdc*o8em;O2mbo8X z7lgbI6c0w>&%RsYD?Ho~eD6&NrW1aHsuNx7`J!I10fn&(g5M7u;WQjUT;6u?F(+57 zhZ5N&6>P}q*-FwX8Z!LfcL54*ZSvvSB1XPnQyJ@8q&EG58bfpdLe`P5xthuzsO(ef zNEO5vtH=-G+!@yLCa6F`Bd==c6TL;LwN{FMea#zmE9xp<05*eLSgK|CFw{*l>%{Hi zjMXNYt6p?qRLVgh$nGOxpuY%}tPHBf)q_yMiagcBoXBBtrZ=<9QG~)kB-GT-oC*Ce;3b_P3+u?s5B*!aCb{U*oE^J3!LE z?dVs5ibe4lT@#%-+XMnp-}Gvqt}9YK>C*9(Fbf#N?x8F7zrt8`OIUI7OD}c>Wd-Q* z<9c*4F}%rx#z3$zC;dG@S=2Gco8mupw0NPp^>ogu|M_?_)LZPx?Bbx0Xwfr^K`N!b z%Tf}E)(?QX+n3Iowz@JE;TZ{afv1HX+@Gw(^z=ZB%db~v24an$p>0QaxBd9`jDQ^p zpy1YT5q5Q;IPATMEA0iXy+!Y3JnfPh{SjHl+?|~3m3ATpXwtj!+^3N$R6ViBNEaiz- zd$a~phnjqqJe!Mgib9=JWj*ndZA9HG1o^2tiqeyV%}2zA*=Xu9MQ46f+Y7YqLJs4E z>zsEq<5%}|pWX*Zf!7WP_$@Mf2f5oC5@xQ=-^*e?lmkBgo_*uK|1@L*E#4NNUPNrlKO?OI_ zN3WjyG!e)Mn>&qtv|VSYISn4~48LC%1}_EPBzy2FAz1(-?M)N%e(f({(SFA%hwY{x zNi|_vCWR#6r~0?U1r*7-I4kk30gj7rAD&*6WWL(qS+AbsfwxYS=kC2tx4eLqI&K7i zlD6TrttYB+*)o_G&l^N5WLK{<>;K**8Ns8Fys++Q3aP8KI*@H(JGCQs?0cx0yO^k7 
zy)~btKa+I^U!>xED`!|T6C_p9_%U=6aAbL0YbtTXjU!ir+%ls9qL)&%duKvd0-zUL z2u11T&PncOS_D#!SJ-AVMi>{J-+fiS(%S6psj+?Nm2XCSe1TMYqEd9_*|u;R{*#Iw zbIs9p;#wXQf6e-Lk>EOQO6beWk4IJ`#p3DRd!g3RY*)fX#$mcdbH94!EJw09jggqI zSTo<<{=SRIsDf5bRjQf+sg>=6f_3!)+yRKElTCI z7x2ohdSJ?4m0$1MDN<@?2{B}@q5A?3H9#ns7v|Nvk!xPLEcRo?^Q7vLY36t*3n3^y zoZjVyvdsMS4)OU?7@sm$vJZp$)_W23Z0NL)M%rjQ0?9+jUkZynzB|4j?UD|Z7v0a? zGe&OqAvs;rrZ1ZLG%hak#3}ni0b8?Py|uMDgli>f%vl?ply#Q2{y-aqSa^uPL;yz1 z`VV9%2e>f(PH4IUD9lSgC9Ujs91`Z^-1mlkD3kQFL)^ifzHbX6mmC#U&%J=;sGtMI zL2LW$p+?j#Cq!hz{=vX@^Q+DUHjN(NELklcj>5_YunU?>FYjg;G42d*`2Naf zW?Q1N%mYu5E~e5{@oESl1L92kDWlCKU0%vv<#3l3xItjh7J}TSlc0iV#_A7XyOJ5w zjvk{B#+~;7h(Ue6^${#^DIhj6))v!NoLP|)OfHPYrMkfFO9E=$o+a40CMT?e`*p{Z zjW9sUjb(Ls8Uke6^Kx-@ZZb{$c8V?4lfdI1Qoniida#wyQ=RYy`#UZGPcuo7hr379!j zVHBPs4bBb#zV-AU$n5LQw|@LjVOz{It~-ao7?<~+l_1<*Hthj|Yy)fMWg zD*xGnhGp`&vgy@HEL_4W-rTsAGjsvaEkKq*Jg%l`0MY{=Ab?&uFOLt{z1&}##-bP; z>28wGDBdv}sXD1<5fvKb6MCz+rw_iN;s^{DsU)7J6>{J@IF_B;T6MDlxWva)OvZ@5 zN87g{=d5{=gHr8gfy^DmQZy*N2|Z%W_BS(k=X&`>? 
zYWMNq@2L6zBF4;h-fI>}>^HrpJWQ!-{;b6|P-ZIFpi@p3OtkKw;d}94`KE!!OsJ1Vo&Zp@AjC1F4~Rv%hI|=_8>mkM4_-Jy6SHf7C&$%E%G#q$beM@Lb-h zc&;q5eCud-$H-=@aklZeE@Q*tulE@L<<3g&&b8y*ae1Pc)5h22$|%=Kx2xJ*fz_~( z3Xc)aw7K2qGj1C*`QHym-IL_Tovo1qJB*XV{$zupe!Vi#ovpo32NS&`1$udNLuOSB zXs@Ize1<$!T*qm3nJWfdD$dLNc zi1gXvsGnu~dgvo1C81`>D!bnTf;%q*oq%Z17&ZAc?Z9RqXSn@6qAdH4!$$|Vwtl_Z zwWRZ%+97a+s;vy(Rka)ai@)6-D_4hBMD*f4;P~_qOwQEQ2Ie2h%fpta;Y02zt1dKw zN4_smWb^s;Khe6gg>t9%38Keers^%(-XBUEiXK_-)0TW&%U<~gPYQ9GZw?Wq*R4I- zXSE)vgU>Eex}-e<_tb|*pcHXS$5a*D9N+dB{{;qAR9 zuPQRmc#_Pw^{?UM^G&fsGmXteZ;*X*hybqJRlbF`MDCdQ9Dy@tXG%-`K21^Zp|!Xu zy0NaxQ6jx_DH~RvfKT@C2!j0t~dnK>;}|7i!_7LI8VG~v{<9YhIkwP-p7m`ZXB*P)aKh`N+&H+ z-aEI9P+Of1hxM{C!emOy>~v_rKp8*(Lr++JEte^r{2AQ;xhl_eKpT^ZLwC;=ffe5i z+gg9=4_sJFR5eZHh5D@E$Vnb%a?M@Vw8ELKu?C~PQf?5 zG2PQdSl5vw4BEV~z9DA4ICNZ~*`~TP{6fzQW5`ppm^D;LNpeZP@3wbpDb#e7XY+eX zaMhl%)4}1x_|K4$Z+TkQI$A@t1{>|V?}d$#>jNtsBT{SHX})KrlZYf~`Phs&{6cJD z8jU?K{0iILACl-<6IEOPHr6=951U=(7BaO7D=5OE9fxv4H&L#g@(eMTjkp-pemCC1 z@_MS))hel`(Yb(A|JU{5Ws4Af^0fM$<|}JAm|3IX>0t%ZsZG6m~7GY&z&|oo;P~OZ?r~5(G$I6i-~latks(zI4g3oI{?;6WA-11LH(WPi{%ZLN!M!} z?b0{dg&$g{&PEn|5i@nh+LZ>*D_?9W0e?H>?{a7y*4|j`9>L__T_3N^*naPsmMr*# z!3BvaC`noy6>7x%i{^?z<;=meDAg%bswmGunQquKqda(p_YlOa)sR@lBQpYq>P_0{ z-*Fbyj3~!(X9G$kChqj3(}_K`X!<~UV~Evc#6+@~sfhu^qf^i|&oDQo>f|y`PO;$X z6 zqP}lbu!9iiJkd6zA%v;Z12j9mFSmvs*$gO|X*)M{B}-cR`xpfsDCFC>%03eINoGJF z_KKJip5&@!EV=2cfJ25si1sw|@LUk3dN??F`h_hSgGMSGDTrwk-AIj^#HgtKeKDgC z3}hGS)Dj$-|GKewwD8r&Vq~#llhMbdr#jKS+|E-ydZ8Lue)SgVm_J9u) z{m3+#&}6HL5pblAgd((CDS)+lzghBKi95=09Q?r08~88u>82RB8$=AzbuL!v;}}Ul zqGYHyFULFhovg&_Ox#i6IPI@Ny-S+VRn@@h>Cx2tg1>!D7DxfpzZ2Z&AQ#j7^`HW` z2vlO+LK zAYC?5F5~CSkq@WgM6?&j*<}rbCQU6H1)T{*!=io>1vCw=C<6Q@b-nXUe2| zF5Pn;{sWZacS5sq1^qTNWQ7-1WM`4r2a+k9CyP`r&^FyU5CWH+uwIQY8KvlF{1$$~ z5G?B5m>BKD)WXwpj|2fhWc<9YY^T-B4NO@|h&@8gS56}N* zZ|HuJ{TY45ZX?fHT}Ik9v!iNT+w#0C(Sf6y{Wl!s{n;l>?VkTwUPibmi}HY})R*^1!#@*)D!Be5=y*T%5H(O8%2*MkeB=*Yp_Q9lNgdfxP{ z#FbZQ0a#g(Mxjq%(OGZoc0A@g^K8s-qjxvhplZA5T;Kmq+<*FM2er-u{X|+F-Ujc5 
z&2}bqo2UnEC7_^(*ugTzWT9#BT?MFPq|5!O?z=l)UN0F_e9YLR0zNxzQmQvI*x@|8 z5jd0YwxOK%riv<mHG=cjk3W0vctY>XC?yVVT@Oe%gO->^sLk212s&C>9{aiv-k z&*Do}CvFCSB@Jrr1!Oramw-svAORGVZ7k>#o{Lei<`=kC3MF=FIU#UwaZ%DH7cIEw zNx?%Y8u2y?Ihv_>A}pYf-9VYm%>Xh@>29Xha-iM+Z$;412DUBB=qrx>E!U?#oCTI^ zjSwu7ixc*B1zPmpr=2w#1>mFvM+K--!)~%-ln2Fzq)o?_*6DsezcFH zDBPx-9ve{M)qH90tpY;nZZxEz0HxoOhPb-8$#eM{6=i2MH&{~n}(O})-#>lb1c&hvHmhkIi zCwf$j+d0Fpy@}-@;h}Mk^j{SIksY_s3V<-UxMW#@*4PfH*LMH$I7obz^duRmpeYx0 z{`FIzY%qGoyEx5t^5wbqMEB?82V+%Zyc?s46Rnv)at0b6t;Z&Ba^1X`-aY^f@F!^~ z_3hy0pi{2yugj%g&o4&xy17PKXN>B9lzQ|v|6FHuQKMJj+WW+P(0cCVnNyIhx{1n=!;c zJKD4iyQY~ckF(k&l~`2o<)Y(=)RuK+{ntE$%9_^?IN#|k8wo%RSmlFuWdJ7y``Mb^ z_?KL!-;I;9--Aq6v^5%B4Jk)HcDP=y4E%pZnIOX>ZUfhAc+{Hz$k70C0%v0P*@5i+ zq1iG!7f0$I!`+}W=|Pu+#|+u{I#a@1skf+4KbQE%3|U=o{rtXv#Em0UN`T>w7DfLZ zfKsboc{E)fr5uSj{78O1AwxGMa83tq%y$o&gVhy!b$i(o0~W~*mR$TqSTSGjYxnF= znoA-2fw8I(Bw79MQqvYN9~W}glEAv;2i6$=uTHqvm*V+OUXugh8!p zdy994Y}#%oE>;1svSy%cjLii8vlkm+qUYN5mv<4f>x*~&qhe3YZd=_L&>jEriJN7a z+l#)nA87x))b`e}{-ZlSQXrzlJibXfE1b=47RQ?)@(^1g#1B}X<$*BIm{~;o9}V0* zja>{EXv_Tkc8PyIv&%Sv!@T#ov?SbS)nBo`5^an#QlWhNj8x7E0XkKNV=tsa( z>`x0O!o_IA*mAzPSD}+|Dr?NMZCDOftS!UQ8_eEM*LYwdusTK=<9et84?-9w%{pQ^ zH^Y$^iB6lID~fljN3&sL9#nNfU^8*wMHC0mV~^r7@NwelGcM+|rwEs*$#RR^f!)F4 zxpR31kJi-Fi(bKI!6fCDAnZ!fSFvI$ziPi54Q9nIF@ESMNQZsCk$VuuI8yHz)GNAU+6uDtkd)Vz~w>&qkq6E zn-VI-G(H;YF5p!avC#2`HGU64zl=?UifIkb#1bbtz4H4Z0jaA;)5osQafSVO`Sv;< z(})g^O;2!)oR! zp)MfYAmU5sK$S<;aZdisp?_hqv%o<};t18bXUBRX7jpT=FYZKj@2YB8a+7UR6lna~qv3A@Zl zr$V7Lhd4s!gvdFWL9Ol1NQz~JV#_k6Ux+A;-z~9f_Kl9N(6gT`LEJ2KG5sQamoUvO zX2=;nta0Xw!D?oD=$VcU)Z@kA%Wb!*T4rK%5O~S1y+wo}mDE~@A6KB1@5+!_)&d{q z!P873p-=8>lHKfbN0IfL?v~n4mMeH06;_+sLaPmh2(R&lqMT zMtKY&OTT(h!ji_a7KsZ9f5LpGAYCCmT*-pAkX>acJ!EOzSYIZv73g9@C)HOGD1n$M z#m58?nzU^|_Z=W}1c#Kd7L~ zVfO}ae7rGo0c3p7J_Pcf@+UK_MCGi2)?%9@FHA)doPs1^$a^;|JTt_FCgo+wIA?qv z^@ag?r*z?ZGw6~<7{H~3x! 
z1Xmq})J)|ehz({FD*X`4Y;Izg6e{&e#z0h7E%09xh$zR*b^&d-}yX?6C}V)XUFNS=d8Z{hpcvhr^`K)IaGR$S`mu# z9)t6WLlLu9i?zvn+m7mi@Qtjc*Z=C#MYzp_B|`dLxGoZnHV@ zQ&C>2sluJ|`-;tmb9^`S4_P?deIqzI*b}rSWIMchKZ(y_)zWBK<*7tj{JMc3!RL`5`=Oha4~WsIBkm8(e>c#uke6&f_MpVLneDo>#x_HCGs#C2`#e0KK?%| zH3H#lIrmRwo`a*`CxC{;KFr{BeRwUmK%Pfn67; zX~4~dhW{~68YHD9w%Bj3RRIaNReQ|zHEfa{@#2xUVvS@PU6$0@qy1nef5K!^VJLWU zuN*MZE&mIM49BYKz99{v3$D_=S=gi>yYxa7L2Rhm5(L&)uSPN0I5_j!?}OPvsC+gf z`ZGq#OhpM=`f3g1Ee7dI)ttc6+}@Xiz86O-GSEf~e0(~_lH%hyW63kwOV=y=^j+ z>e81Y!cZ*eQxUD2clXGc&3~56{zk+?p?T`x(o1C%ht@pp?stMc2mfkL_ycGB4X=V_ zhC%TP(9RODG1(6DtTKpBKB26{bFyu!1Mwh$Anc3GbAs|~-+Pd7OOzIZ?wF)6maBy$ znqn`+mKFtx2X*0>aDyq;7ExwuHR&OVAB2$)%$008-^STvVp<6jA<*CqG6Um{U{kA< zIR30B>d(mb=aFf)Qb_;Z{RP1sH|h$F#I@bOZ;h$I3Tv@vGNI|%iu0n~hhjq{vjmin z|1p&$1w55_4GsAz_Z}&*4u5+|gPk4iM-Nf%(&qHOTlxph7DvGGZQq65Ndn!^gq~(& z__YLYZ#%XB;c5Rs(zrX4ifrOvEM?!q`1}|0%YSe8H0%1aE~Xsa=FKJb$fLM6@8SD8 zl^8dYoXQ(Jp3~9Jiz+jqUstFGkqC8ig#l)ZQzw}~#|JSu^W+uc|C5Nq1}O)lu)-Em z5Gd_w7E)^5Ytbck1>zQj63x|V$Xe9)C@Jt0DDCV#TCl?^qn$%Bh z${PQW;QDntwwZ%XbC8~;L)@G~YMqr*@5y4uyHsD4J zS2$$+4rh&H%Q64dn@|RuvGk23Yi;f_7?R)kv#J;oB&B!)afUWBK6Tp3WxmKOXp_nC z)o2WOo>7Aey!3|bG?H>BSUeYYk2-6-pFM2xI5$hXgF^N2i6&$T`2zY{-HU`{@qzi@ z#*io>9=Ue3A{@Pk1=KE4w>k%D% z8JE)<(hoC1ovs0Va!~%ialmEXPc$W(NY&cqO`{re!g220qseYRc`1ebRFkS8LQldM za_3&n^SeGP?w{Xh-SP=2LR5AW+MEdkSycN zb?m;^6NPL;fgp#O!bfe(!n&_gh{7tI9dHnY9O-~gyb0H8Xt z!tK4of44LNQA%MeLJ5r(g&_Y8zxi;LA7L?}S-6c-Qvbtc{0FWFMFVjCS5i!5|4njm h1Jr>J|5(L&=}F1v()Mv5nLEG_OhHY)7-ACee*ldSo3{V} literal 34837 zcmdqIcU03|w=OC#A_&q11gRn-pj7Fhy-1TTMd@7x1Oz1XD!oUF^ddF%BE1HQfKmdX zg&KMZE%Xk#L4Cjd-E;Oici%h4-Q$kSACLhgYp%Iwd7fv@U-(Nk1!6*4!W%bk5G%fr z)x2>7*Yw5>oCgHAZrr#r^onf$#tr`)in7w$?#7!Lc*!i?DIs2PV*&oM6sP_g+qcHp z7uZ*Q*e_e8U^RnSRlG6;S z#JsDXY0!9c0QD5bZ{xy_cBG|kgwH&riB5^mMAIk_76Znr+3p0!iDZ2R%5$`>4rz*X zgxfw=f7A(V`^xrbQ(ND}Ofj**=)P&K!Gr)`JwT}O5%qzEW7`GB0EByFuPOT1k+o)@ zx}Wdpkl+J+m0#WnBk2+|g;cS{EsQ|39(C?xo>;pt+jI{Gtkiydl`LixGyF}vjZYEI zQH*yCDEKm=(qL7RXGKC26Jwd1w{gKC?aqjAr!CDt)&4{!i4p_m^jz?d-A)b&Npbw` 
zY+0U{y*Rk?&Mi=0FEU9#=>f_EseEUQ@u0n8-D10Eq159}_ozEYF9)yid@?A9kLS|K zjugpmO(k_`h){h9$rD*P0Og&`U^v%M0ryB2+VA4ECIe@lp2qXE0OkpYn%Q@!n$1{Bu+mw>lGMhLBT`(CWc;?G zj&UYc&_0GgZWTh>YBC6rrvGDd;K+ye*PLq(io4>`&I5*cmvxje)_vy#v+7S2)M2+_ zmu}J!Zl9x=TeFLe)s#oz2X~I11hdf7B@1x?-btpT;=HA@{IVz&rf!4=VWJ(=+d1%j zGyw8RNelBACzUS)l~=j4agW|gFXC<`Q)W=ZVufOI*m`jmn~e!O%0#G<_#|ag$0mwN z(Ss$5h!w^$oGC7jm^XKi-V7}pZW|HGWIgT_-I)GtRJz1-8_ea+w|i#t1oP*hQusLlvB#`( zV9noI2qzku#M_-E^sM|M*i#kB>jT<9PTf&Zny*NRapPvl9sG#9K!()9->ed0cJ;kcYg8Q0sTa>(>S_4GjyL)&}u!WN2e;GUj36kD!hLI$eSDoufEAz7O%~ zS5w}KIa7S?31bKrxQz+-Pi&&}NN92enejh&TpM)nRK!cjhi99L=BJqwvlhvvfFe)9?K5~K z@`laW4bM6;~l#BpV zB!|UPzZe@?xx*gG_{b1nt}?TT=RRI{6w(o$_Bm$a(D5+O_RVcReLRC(W*bQz#j)Ku zQ24IXh4rgMT%lTEU)b=DCXFN7SAuaCPbBW@%u#Kv&$DTgoXz z_`d<4&xV;`(@KdiX8|G%Y=;qc9CHNE(*z9@w&Hfxp(Ohp&iEdqCU-vxnv-pJgzYcv zrn!Hq%CC$LsEm&{pUh^VxeG9rQi)*E%GOn09`G}4y7iU5(lLTH*Hk6)yUq8My?LZr z2YgS9&M%P?(ZCg_9a9(0(@88me?z{tlEo~F{wo~cVA?B`79R#-iPm^Ih9HLt{^jadZ4;*_4}l6TK*MrgJ;Ym(X<0A7`V|u_5{DhkzW?qtp5m zB=oc?r-__CouGjEE|XQLi?Oe4FSHPDshGqPQvL0`Tuo^qi9VhY#vFt1^xW#%P4LfI zt;F%U#g@De8(tzJs0fO3=)(CC*6D73Y7r!?Eb3l2ot_J+M9j_$51epvy!X7jZTMr2 z$>}-x4mWqOR1)Pr*%${yywGg-q=@_?joF_Ld*D1pR1HEKf;QYNn7~ zrFi6vbYKoI$Ji(4z+CK)9kbypg@-|8IkRzB=nBcWJYSZc8m=Y5u<|5xj;yAnJ@%{w z^;22NSFc}TopMH@(aEFeqnBV~@WYnOh^JFi(uHSPLy{wFFWkGhv>0pP0b5RlN!;02 z_TjouhzCu^5&7Qh#yz>-nv%;Dr`me2us&QbY^?nkPr%UXd3EXixDpL48Tf@)axAVkz=h7nd4- zr@M5hr=V#{)Rxt@3g)uJE$8G?nyJmvD_oJYS!XY_6Qz8Xv)8=*E>X%Tt|Zln z@(P{k6^gTsB!@ipS)W&wnfbH6n9m0V%4*J8epkEua18CbgOmq<6LMoQY1_@i5(-aK zpUJ%1Y_*zRWokLHJ{z}Bl6P9ya9GkP#GD;n9%NXdl}x;M&JIEyBpEXXh7e5RNl2gm z#_fHyRMx`YDwBy0k5(4sO84yB{;~YLao#1;*-c?Sx-iz;({E2=+ap(H+*$AJTzyul zLgZZJ_%tc2Wo<9;nb{(9mdR4ML{ffM+d0s4^_5lg>S?S2cX-ddl;4%K=Wbr^5ay~A zA71P6?x^gP-|+*~AX(?+ifr{jbE z&;hyKmZJ&FjZ~r2pJM8b-lz3htqd6(MV@i$epm1fRB8!lmYS4jLbFL`eYLvCqX;XP zk*j7=wy=SEW8V;EuMm`cGhnDoVyp0S`ZQeZ{rwCgIlT$L`mO2xrjw3+ux*9V*bE|S z7{r~VSCLD)rUWtm+*42XJj_=~d-qtu1HVsbjOPB7`;6dFidA~sTlHNxMFBONs0qi4 
zNHwV*(ImeU^9V8DgE8fd+|%VLbvdW=+{)@-={k`6^VlejB+Zo;@p2^@&K6@Ez4HlE z-Ty3??lvLxel3lFwW(+OHpLKCqMmpYw{ijw{nPWywDS4_XF{m)1%eSGXaxLr>N!mT z@=R3^iw(9HHW=_xDC=sYhArHUv7m~{6@K;Xy-3>FdDwY0o@1S%cVv&aVq zI+Y}DYxQr3E;3CYW*pff#fzZ{vC#f?wbfFQ>a&Q<`M;inn<4C z#k{tg3LEaM#m4^2tk$zn)l~yBo4x!1&yldc)dZ^`Si?4b6g+PCKk=5kKGK*|cn0MJ4r)L!PpFuWv`0azy zkewQqUajnwHIKLNh*ndcuYl`$?ISeGMqe8WwXRMsXtO-^EQN;TllBfsot-U z1Nty-HH^xmT+_;!f|=%lu z+`O}f8Z2>{o;_*NSvnH^V5!SBLE(jVcInJ6$&TAOo9BzKMNOmj25)pvR5|Szf<4~I z8Oiwa4KajRIHWVv&D!QKC#s+u_Dc7h97Ii-THQ?m!*_-oQQk*}>Lc;qO_x3V!}#x8 zD!rqeVTE6^-lybCxQ#!Qm>&#ZwxF-Xwv1-jQ&_N5O1Z>&DSsom?D^)n`?M6~{tx? zTd#&ZR?_=bHM03sJ$=_{Y+s8!PB$K9r8XFhCK&AXoT9R7x^)z=-=D!)vw8%_bTpC6 zYcaN+E=24w)J+fm>_US&V81$!B)nE3{wihxIMraRl z9twc=!ZftxHECw*R>^WDRKhK4;Gu_!xFPK7$Hs2Plt>D}RCS=Sk?)vOIZ62uWL||| zbvK%=8PM8-d$g3H)1Zc9l9g*gItg|8#2a9^acc4CQ&EU(Sw1z(;)UNhG!*_(L9g1W znPQgWwf1$ZrPk=2Wm;yar9=J@2>T(Fw9(>xprA z@%2^|?1q$2P3k*QQ$)I7+&Rp#hdCEt#9tyKWPI40@N+kgg?7@RfEf)5jhbGA>u}_1 zhpz%VJ@0mFYAJ3AOUq`Q(SWhUdbD`YU6;RHMyH+oUS|eaxh#uzH=nWG*-*u|+fW@- zJf50K;YYzssgewy z5gwZZ_>M#3SE0$YD!Ryii54YGZ31RlDc0hX^IUJ-WEUY3F%>Eci5n~=X1KvjiVGR; z(Uid<8q-Xlkk_w2}+?jTS7rhT{*=fdZ z`@nx+B5EOc{$f@@-u~i>{R$It*EntMOx&EGnzr?NfbEZZ5kGtw7w8jEc^2T{BiFc4e2=F*e?d3Qa%CUcCkMk=7aLm^Im1`cO4|2#*fnYb~vWbN{9}p{mItDGhB^&GbfsTXIvyK=FXT+uo)6(kzZ|$amry^ za?yRhZ@(poghuNpR2-VJ@t);=LHRx=cv#{7Wr5^zTJ93nEX9b{bxhvBj>*r0c)Mj_ zwBrUrviurU7c3_}rU(CWyR(oc_jNicxb8uEu|COoP(Ol4%+y=Bb>iqe)#5+(y@^d@Z%OCuu?nC{IzmxtU|V~RQKh87-Ex3Yw~Lix_nBOF6?^q zV$cgc_bU&hbZiDGhN8DzEo$Z^G#r*GSU!x(KSi~hYCpY0#K5~368oB7tAJ5+n}C#| zvZ|L`Bi3r9--_PtQ(Egw`$wmz=M_)8iJ_+%w-J@g8XQK~Eh*u;CG~u0C$oJ*aI7Vt zywc6L(bC}_r84h5KgSzqhLG8@*ka)I?VETeDy}ZmgXL_mU2*ZpviQ!rj)&9Kd3fdp z$ks&jcemXhmSCI47_*V%IyseDRKrBqTR*C!63zudtlwD1hV|$i?Y7oC#R@iS{x&A| z$EMu9uH_l!>7*GoNo_&=C}Yn8>7l|q(23iz=o*{lSL*v5+Kl*7t8g)VV-Nbucf|;t z+*o6K%4)Ggc}5TTM8}HJr5VYkjg(9|HtZp@3BS=XqROI?iKz{T=~FcD^_oHt)~o?S~27G z=1-43N9a73f?57%Fg4M8sXD^p+GFGZFij}Qe6MCc8t-lHCN1B55!r*flMAb3AT|bm ziB%I>76P`D<%VpMM0o4Ekf=3Jn?p+fftz?=qj$B$yP 
z?O4;Zd=xcs!03g42Ml8J7~>rbD*&g3e8kLm&FA94Mp%Skrapf;T~Ny2%i4#0@@~8JHN0ulW&}BHi~e{Gd~3KinQ;@^-k;2QESmaV#tGlMO&Qj?4gf#JeCI z84SmJ+*0*4x)(k@+uMYe;;ty9ro3K*?unQ#;j_d<8Sm%V0h&)#d@@FHoM(dB6gFu0 z3*THzq`$LsAv87ifz&(8&cBX}Kr@<>1>ny%;ZgKKX!KrAXl56`md1D$Nhl}q!;4Uj zxbt$U*$k^C=ypJ->%OC9g^0f=ykZKCVNQ>9)c#6-D3G;gXYNdtS-*2t9G-G> zQ-2Lq<>1cX@(5$wu*i<~uRa6{Ix)BipxJYwUnZM)0XXi6Tw=7^zY}lG%daHKBa(k8 zyiavde82S*vHS_=eo8UkBcOhU&eQ#H4udH6YS*qz+X9-?J+Z|@*|NS|PWZ>~sqE)N zC-QP&=`xKj&(UJ<4!%{7>I{Q2N|v~&xK|q|C9MB00~GoG#5vy%`{Thh4%)9O=U?ah z@3Ppb`1PcE8i4dO)Jui0L4D57+i>`uF}-I><^#*GO7_{6RWw?57lw0oG9$LuJv!xU zV<$5UKcC9pUoWe*l^{<$JhACMjQDruqS)l1?wNgGh;kYAAIU&itCYN#->9no`^c1h z2G7X4xA{S&s+^sB_4k8*9^?<)o`1q2U$+OmoiEx5C5_;b6gtlDKsgDorjzHi(9Ep6=~!UkJCY-7*& zlt|CxKJXaJ?d|kaE6lejlnEn7m#O|u2l6u+cEU%eOpu*(M;-?T^W5k!Sxz$UYjr#f z7vl!OUQwV6&Aishp`ne`GCa*oIVgV|+-a{s-^12&H-*ci^NNO~jJ>M|=L|lIe?+O- zC-%(~p4)gZp>+Fk>P$Z*`$f($=mW+8c}#g&F!@;>NWb&J#+dW{!0yo0wFE&ZWqN#j z+qa}OsYvgg4kNRx(^4?H<;zrN^V|qa4^PUmc_4xDvjL)6`-`_Fqo=Z=crFY)X2vd? z&$c@c*cjT}C>G;>%{6m)F9g-ZPDQ@BRU%NB8$rLcKH6KgTwL#p9^Vcq73c8&S>R4YW9(d9)f=wCvu4}UOA z%Vb0^Kp``v+NJ#~ZJ$6w{gmD1wS0->BN@-~JMaWxTAUlLNV|stra(qV$vfCT*A7vT zWUSG`rXAQsOr?B|ZB|wI0Byr#bP6Oi8LA^zSIM41o*>U4t7!m3*-Np#pOHmL7ieff zMwf)T zgpvDKTao?is}kd@^Al#Pksi$26h|0Vpc*+SInLr@GMMp$;=v~J%zAc3*%;Spjpq-f zpWXvGGwBiaww}yhVorCW5ItHQ&q4(9Z%xijlH+iTGJI<0~N)u zB`i3DMNREY`JupEqcgh?6q(R(Ihn`Z2JZzVCEj7oFS)L7>}C0}fUxah!71Mp$`E*a zCm$hH7sG2k_%}iXibDr@^P)4MCAh0^kv?^Yw%~D`u!z zaWNRix5MxhyUT$$*K439bA!SSFSOjKX9`>4hSaiPt=u4xv75`C8%|gb#Y^{ zQWqR;QTLVC}8Z^p8SC()5NKfb%vJTZ!<$&(hI+D`*Fkf9HK-p?%1M#6?Y_h zMXYQMhYc1FG!@w#9O(E8HEzJCQ)#18bdSVmRS{8(<~mC-Uf) z@!PRpbwnc*x8X4&2duGdpXFK}!w>uK#&8o$@YK~aK69@;?<>r==dI@EjRL7Qn2a%; zAnGEPtR!3_G>=fzg_iu4AE??GW7^@q9U;=)1k4(E-0nn zUL)uQi{udgZWb8eXr-_hgI>l|#xQKme!47gd$@;8*wvRs)GN6XK2-Z`a-5K_mJ(oX z34Rblyu#_yLn-@I3@?_WmvjcGK{-&ORq#Obc=&d(^8lX7nw8FSJBR+Yn^VEgZD0~Cn%e~a zD0hn*sruXs8UWr~_(ak@>MH%_O^~}UwZc@k`yOjRN&ZbgP_0Y*Oc&F{OY` 
zoLq}&R7%tv`>)%y@0TJ0DX1Xe%PJv+TY{zHuP*tP?A3*jVhe6m8S;`3TmEX20b$jh zlZBPcU7+0!_33*)q@EinvFR>G5sCT745bW2^-T&);(iqdq4OZ8H{+I(14tYTTkEm) z%D`-&ihEKg4pPow<;e3Ap}*v5g|=RL^UQvm#Pfnhc&%V8^zVs1n%oZ)q{kmn<=;V^BUszv3zfO z_1!8xvJ%RsY&Rss@CKSz=DI|Yd!b@<0U$0*puPTQkBO|Nr&vr2Ltxbm+YU6=|ubYb61hPpr+QWtwsXqr>rLj0cGy zx7^>vPQ>*LMdLAO4;FsdeCYhh$f-*wbKNCTxoJp$I-Mei5vIQIY}>_BT8vVlkiPjx zL*7%E;fkR{wa?J!F;i!5Qfk&+%(BZnX(FJUJ>Zh5Dra?GuOwwR~Ptb z5P#xVa|S4~tDGY^brz^e{kfv3qrZ{r{Coroif^zpXie63{kln)Zt#r2;TH8HBOCZ_ zZ5|QwBh2kXCOmu1m>I)u#R!`#WM*Lpmm;4e!*gC46ivo93)M*-<&ibnYtkv!PLkT0 zfJ!IDU9qax>(kf2BXxF#&&clVy3UEnulD51S-hS3k`O+bdtkZR~vELwuD3 zRqSm{cfe!T&eh4JLkFeR!EgB|gVYR_Qf!|Gc1e}z*GTb*_o)S9J?Sg)h|XpMu8ar6 zG<5~>H@^iGPy%0*B_@52D@Wg9sKmkZ(IcFFKtPRMU6!bU7j^bg5|*ATJ?EE)`TTwO z)d@M1*5GyO zfE{rXnqkVM7ZF;KO@Y89#jjxZ!|yn0WnZfB`m-p(bhmM^vVBGHvp|Y=k92%>HE6c^19I5=zpPZp^@s9q7j6!^9+Mxn>Ren+CqRfzdva0QB8p)ZXC`dDE{a$q9!DuTZ^bZW18L0-Tt1MLamcT|yn%J$h- zV2ux~GlOZ!g|K$Q`H5nTXpS*)3r+a6{+WX~xmA=gJ@B;trb04Wm-zUegIUD|cRlQ| zIZ7rznE+G(RKL70hebcvEz`0 z$7ch14T)t`tyc@ps$BSYR#A}n1rLC^_$WPZrXPLa?9J82tJp`)EFV`NddI-&6qlR2 zII${rZ|FSzh(6uxejMC&E{ud2-n>YFF!cdw?gDFa?Q6XZ>LvUE!xO)jR7!@dPc0-g zFh&;F@2nu~=T??-gpCKEakqdmiV=0~9JmESA*-#0VrI2Ne1}fD)YnR3PB^drf?tA@ z5_XW1Kt;=&3-y6yP~S)+em}sGmSMi83lvwb!H@QbvtR}Se6~+_UvvcTV3&smI@11a7Pt?I>me4S+1-nlq%FT7 zE!EgXc!u@^OfRI68dq{df=^!LhI6O{4gZ~o6E?h7(ON*M4&+ASKy4=Z$>{o9^Z>_6 zd3z~EEB+FcLHjM|&rrR~=~x3>1Dq2y#vZ^2S5I^l;S)DD{Nd!^s@r!H9!)V5V*h)- zMpd<`Z3>|%l3^B##~^coxuWTQVn`awvNWK9*zr5F*z=DeSI1r}sZn4UqS`$6u$NJe zZH&vljO=UqdA0)kjhZdn^t^jxtB0_B-s@s4aDNsz0>-!cPI_@`CU3lbH=0UlZ)a9+ zfPY5@?KN&Ae$t?j)G^qV&x`iYc2+iX)FG2J;n<8C;)}*Fkm}%6M94p^$4>u|158IL zk+^R}O;zOfNGOT_hS+rPDLT#Eo)-}JA(bR%>2Q=yd3Mu6sXJ)6J$MkuA_*`Xwju_F z2Qmb@n=QI&O1n`4|0a73h~E~PpF5rYO5gkfD}Vr)nYx-3T{i=b;<>vcU_|Y)9k-dI z;7EA>PNm_n6HYd6o&3;ftPEHw6BUibAwu}ifATgWRQSHjAQ2z1vlzcnC^fKZDJ_!m zjMrgk`3vu)ECcQ|7`-#!c+8KFb&v0x7#K}~xEJ+;3Wim_ZndG z)U8b7+D&0Ae^*puoZjXYv(0Ijy&MczM7XIw$(UjzfKKcWx*G@<*_j5DBs;AAeZn9j 
zGzZ?{>*rbO(-djFtR=v5wKxdL5)|%o74T>Zhls8iJ1!}n9j_=k%$|>*VR3~Z+Z#Am zk{N$_7(+k#FbsF&=IT`{C7lWWy&6XvMW6r@XP>3q7{VQV?(B6&6laH3JFxSZO|HbN z7T-h*T0;(d7^TH^`HKu?r~syIrTghi8FyD1D&LtW#MH(zW-&40oB2ccdWS^Qx!PlQ zXDMQU-}JtTj@3TL21=~Wf{%U2KETu>Zh=6;40wZ2HNI#zs?wmzxxL_Vr?c-I@5ve?#3pyr>L)r=)4sAaaqKbtGepqF84Yn;;w+}o|&7A3ZfzgesFK3UhrvI+hU_Go8&gj z=mXoEcdCBkv9zn`9Pv&rDby6=a{30*Dv-VBWLVrh(XbmB%XhX;)6~V@P=MB_Y~no_ zcpGXc-*#$mIm%XxVM|xEv%}=C(L!qyffpBN__WV_pQ!%$saWOIRl^8T3qpAq%0QQX z=A8t*ln2`_&L=$RQ*SH@acfTwpu8)z(t^7pjAMGRe)NP^FCtA2TMRp5S)~N|u_NK> zq}3!VkH3jtg^C7P&}Ll618o$N`4I%lp};uQ#8N~*`gdvTF43Nxm1Zwa;GoCe@JB3K zEXrT3Q0``Kll&Vfu=ByvPerd>Zp|NFwdaZAx2?VaRO!3p_W}6zUP4Ya*1Y;fljA4{ z`0n0P>D@nmyAfY$JXu4Th``y8VX!rnS$AspC%Gq&+G2lT`z?{Z69vQZ65`0TAWr$! zly6GSUExnw^zKxa{{RW`4K35{`}Pmb(=S9yU~d;QKM+Q(jNc65op+BlDcQ6fL;9uzBX4Hc8dL>#jL{c9l5f zwB}#M7)<_REY7KKk_OE_kV)wlm97i8HMHP$ap?LkZ;ccCQ615OW8ByC%{rfBP&ap? zy}lUwX#*6x2`u9|Jg{TLT&%+vZHf`Qm+y-H`XO(I+)JA?77wF78%ah`_Ok&QnKs1_&udi1D1cC6Sz6R_MJuru6$GWFiiv|nt@Pe{3PU}$_ehJh#&!Bcb7Zd>xWOb z6 zT4O086x|^j4X1%3qau>1Gqvww6I&mxDLt(Q_#K9*vgGxCc;jVYv8AvRj|#WIbnw+*gY@M}obus>UaSNVoLLm>f#T1>~UP=w!nqo7unb&nM zbn_M&FczeVen>KmUN@V{({=Dkp+pno$8q%O@kaC&P^)A-;vS%dS&Mk$0s9 z*%aZsW+Od;)k~Pw9flxZpk$OX<5!K_F4V_C#>x>c=E;!1*^(>L3VB+!`ctL0KWbd? 
zqJw|a!yFSQM@^vbxJQ+dVJsyYXZ6q1Cd_ACDn@U7Hl^ZEsP2`uLyn^ksGqH3&*3`K za%Sh<3@nv^C1!+=ZA{*)|KXXkenBoecCz2!7nI#v1=V`nU0fgB{~IJU2hHm^*DghU zl@~XnMNBKO!=p%Y>kiEh_|f9o+am3{-v!k{!j2nHz2n&M(mr`X?pg0SW+7b4HGzuahTlOJ*XoaZTf|7&nDA-uMq5;Z|b zCa5TNdJ0jTCzcMqa&1 z8&~UM2>%dvo;zdBFwxk3ftFfoWFKUban+4RMl85Su*=lFe8t>UC)Rp((T}o6WMoR7 z|M-l;TrFR&d2_{Pw&G~S7k)_iA6BA$q$KkWQW+_jV0}B5EANP?Sy#s8t_HML>!1SS z^NG8rT^uDad`A@x*=kwyP8_3ye7GK@1@^T#CkkSxq<~%b<-r3Ip0BX^{Ldb1f#W!0 z1^fd7K21|GX$AW*2`|9=eySnN#pRSn%e#jXUbs1rHI~_bZ%{f%;PXU*#lr+(+U|d_f6NrIpo?wX zVX-NVE9rs~7ws82&X-0B%m#(UWJU(6_*-J%mR)FEhxokmOG+EGDeF_O!@p2bTU6=S z{@#6JCoa0|;2wKPd^i91i7V9v{kZqrDnNMQ{N(Mk^9mC;t=mjZ`+vE$Yz{>>a5xBM zz7oX0vnA}525hvv7v6E+{Lks9j_G-I9(=lVD}dTi2a^I~1%5xtFsr;3*`?tQCd6g{HWcL(kymhZx+;?|O2W;15Dre*ih8YU84Vn^ zcU3`qc)kPLb7>gYP5UCg9*N9_v8%ED^F~3gmJ?0X;jtu2GRyVbD3z*f=bXWIME zNXJ<%xJxZ?wz>km@xCSaJcd|!R>)^NV<1-~gB@D|SoNR3iR5fO?Q}3q@j3WRI=zdd zkddH=o;mNch1ZTAz=i#kL6Tc7z*yEqwIpM+{DUQD9?$}?7@NS_Y~+H5=gn;=b7odR zAr}HS{x>)6f-R|?oyNbgh`Y^6PBrde#~t2wSJl2Msd!qZ z%LFm%?bLQ(q@}Yk%;aLO7dY4cu{_FAqG>6_E5|*9eQ%Ps7+V;MZAR8mL%j|j?LLxc zBhJGfU7T~9QLx5^0${hq@oI3Oe zZe-BCQQy#hXj&on6Q5#Yx`C^$qjAX1{FG1OCLx+n-{ruDkUUsu*0@lkIDKhT z0K;~bBP+h1PUze$k7ef%BJ}>*{(wm|j2Wv}agO{Thh_{PHD|TbbfRy5-eV?(*VcX) zgSaaSiD{e1Umi~i(bkr-{zn{b_*bn2x?d?2j1kaCsW}gbbzPqP#3`Zlu-+VD*Pzww z+3-KTbt~6^9zlyUWm;`-XyI282vR#1TL=Q4-M7M)LmLpgFCCD=i|N5h#n%;90_K-@3+;yxh1h>ul?y6v)*85L;F|hF z#sVCJJ<t9lYeVY=v+`os1Y z6#SuSjKA5oiget~^l7$go>mpJYcNQE4;52g%C6GNm9^2U8zPL4PoBY(7@|Hh@=e_l8)7Yb=T}`ZaocFK>}HTPbds z9E!^l>nIj0{6SY!D*s^GK27Y?F4fERRNy2~$dO8Q-)lIu`!7p~KLmRPU8VUiQ+si8 z&9=8xWPECC5y}ct^vm+!E?n$tzOB$6oqw7YjY4~`1-ng)EsjlC@Be!yd?lfYmT|BW zQu&r`E+Cfv*0aY$Cyy=D$L7FjtxmxN*xXV@j;|G4pIm9%^!?e>!f&Vh1><^4ELi_z zXKaK%-`;3_h0sTgdP$O)8=MlAH%gpI3haJinIVB-hsCx&A;eXh)+Ya~c#U}fC+c$z z7~h6-Pz$7AZim* zr&z{wF@D;v!mtc^sO#b=?Mf;2)N$qFv{?3dOlHVWF{v?jwm}MfG8{59&#?SEF|jsVCUmKZ zN8@oZef}@@_AeUsAH?z+RucWD85yzyA0GUtDc`g(hCkeyWGqRTb+Ju4EV`PBxU5}W zhd;3`k2m(|X3=nGmNMg`3rg?CKc^Q-qEjqW%PxJI7Wg#eWa24e@xG1XG9g8Z(5 
z%ZZV?T6lSpLQ8ui7~L`7s@)>h*QOIeo%z0IIf&iYgbW)#zdE!81_?@+*s7bf{%P4= zkTm1%M`;XlAH9Ko@pay#X5Sb;|7mGUlb4S)4J^{GyS(WwC4`6IUzl#hR-AOi*-Yf{ z!Td<%QL5*6qV$1=r89PM(AX64=`m8(0G;vS>ND!G0h+86O>`aU*Kp;3rsUV1WjAi- zzEvtKq2F0Qx>K>9yk^L+`d3)c~tsBOHz6dYV1$uHT!b z#4o6VB^vG^I;Tr7(DU7Aht3r(tb`Q`2Iv|EkeHMZ@|~BPgBOE0P)-E>B+aYsH21_u z`a-HpEk{tc?#JCjTBoAb4##!rjw0$U?0O{f#_!)l_pK+YN*cW8Sl7%>V%f6d%UjMG zuKkTs|KC;o;%*!-6f~;=`A)vy*d5bNwTs2&%B$=%2kq6{`4GuzF;h_MnVIL$=;MNc zmczikMo`;Ti7l@_xSa^!F5L3<3#&|*BjCD~6i+Bt=slJ=4zc|qA-TL5;VA!?<8vXY*ga%$Pq(Z9NR1Rv5ezMj{^4Ij3}sTUxueei(JhkPv?9- z3F(Yd_g#%-XXv?Mxr-8(jD#K5{m}Y-TAC6h;|5X}_MVHE>dn-}``#f=Ra87CD;`;U zt-n`-{9MJ)@@rlIWM&IPPFxa%I&ds%dE z9T7y7UIIvyUPA{#=>aLyi-3Sg4M^`DDG3mI2_^IZA)$oM-RL-He&^0_&N^qUv(~-- zCoB8g`z!k`&-1+RZd#dAafV0%SnX(#o#X>7PEpo*r#rgH3!a5B;AROC^i5&)TJT@i zwG9R0sGrsm;3Asc`SJrNt3Dx7Rajsm=&$&>>9jX|IUNL zikrzx5ch~p8+09!q}o=H)IFC~j(_J8fT+u5?K8;$y=mguv)^>wAlQ|eriTN0zpIht zPKtLpk$Q`nrg*_{gYr!L40j&k8QJ8J5yV!?!0{Jral%j12uxw|s{8_AM z_mNxuZBHQg#wWL8OM8+LLzFF&#n3JWMqRviJE zWq)-D52xw6v_rl_=-ezKiJ?Dxls^vw5RB34I@WMG8rF%f%%LZ%bXdLXbup@nlyJ!h zgIu9JRaWwKwBhsQOk#b`cB6+6#6TTsZjoH@7!w1``zv%iEGq}@5hc9_vRP4r8))fN zvH`r>NiKe`V3B^#ZDwC~^GkH*;qtS{oOWeqAk6-I8#n7q#w6VNYChe32FEN0I=aOD zZTad7b3}t~z0HK)o!F(^qUzylnB3Nq%hVFZhzHD!shCV;BKrvS0QF|a9$f=>Q)pZs zLUq_I_gVbN&EE%J8ExuFUQOnh?tTnEauVu^KMO=R;#of*Uy-dpN)cwL^KUtR)!_$TyO^fU(>AmB_K_gLXMTO`QUtjOEIZI84iw~{@G!(dT;a;D5A z_IqJ$?LugHis#0{LZujJi>LVcSZvHSCSlf!8cIOuk<1Cy;pr^<3f%0#?^C|Td*g^l zo=>5C*(L=bN~?^kSHj69XGurSEtc}Jona2(9M-Efb*ZQy@Tw}Nm=sd@|xUGXjB&4GSXm$Fi7i-*#pI% z$@e#g;OEkjC@n0U?X+86a!vgNc9zSAPurE^yabyUf#cC@*bRxKTIrGZtIn3ei|KGvqKo;;y)?kcrY=&!>%UF<0tL?^`;^l;QHG4;^y^3Cwe3v+`cg%0~ITk?DNz%r}J_uR+rdzvk3fF@b( zVoC(Jq-(04#?}(HFEO&ZwnIWno;+Cg#d-=}rqu5^<|%0%*bstDZMt<-;YI{sH17Ao zo)#6qs7B}$E8quo=6pn5t`?jnV6}t_Ueq!qgZ}R{l;n=KCmiZ3239R|p7J$hkeI1o zV0Luc9CarrUfZOCbDvF7eeCBHdoUSke#}5^=1MVwWZiKYx>89d;(RvCD?;@V9!pz- z#u_e}6SG_Vm?M&E_n|-L`46rF4lHqy-J9hsTd`p85u-5Wpl zy(&tL9$l;3j_p+YBYj131_}Fqr&ZwGpUr^+#lx+gcccr$xcfM;ZnmPc*U3w}Ud28j 
zCY^mbE7ebdy+XGDQ)Zl(+C>Rn(X7-W$rKT8-&cwdhA>+$70(V#+*JAHQ*y3HO^+rV zEx0vpAPF8iX@fj)*A%~FJk3RCK|C}oV-6p)`hdSjg=yJQYOuSqhxA)W$Ty?uOiV&( z)1aK?J4?ml_)fEIhuJS$l82ljy%eW4&}ehANS=}y@a4HwVir{`%z8C=jNsu2^0<=g zDF6fp=HZWEfxxPaV5y}P%cvzlL1T1cDuHQE>SF}P+H9p1&1f+duWrCRHBg`H$Ihdb zOs~Hl_Q;wZH2p6;H&9nU&g$MuPq4ML|26{*feYImJ*J0^yVR^LET%e6>Hzh#_R_N6 z!tJ$$0aQT`Y4khDKiKjQ1Lf(})VtZJ$AVb}2}$-Ja{j;tr2s8XZK?T08awN8&~3JB zhWdhS?;~%?;Ij!8#EoClNFKId5HWppr#4*hz81=nhd2yrB&z|NVp$RA<;;h;(o2bYW_sSi1O|)j^ zsy7cUghi&*H7%tho@SZRq70bH{eHH%A`Q;sv;ZKpBFpGS=;RRR& zaW0@g&y3zN3VYXQmqi=1!Zy3%UB|OAHJYoMAzTszG>y1?h0M|WGgPQX?}f;bHg7VW zLW?GsVhiKahzc#xyV3Y%=6Fpz&_M#gr0;+Y>ILNJTYvlXR@px|fT*M1UJqynpiOD= zPTz7Z4ABcS*1~kH^4McOn!brYi%sksmv0$L+=M8tg%#NOc2dnn$}$=A;Ut*ImVu!Q zNY&`9#{nQW=-b$rMoG|`L(o>QV;2~8Uv*?p!4%lRvJ6A|^uneMyv&xi6W~!#z{c$T z3^#RNa{{N!RB6y7Y;jAfPe5*JVZ=e;fzm#HhXvs*I_aqcE(H2hjGcF3SK&kZ-tdmD zfq=&aKIvmIq_sG0`q9;P4FL_ zka6r@(+qK2fY4<*`3=RoFQfZ%8A}oUm+pdT4h7$|Cf;yiG!}l>E`ixkk4r(vi?Z%! z3w>;f(6(c?;4d*jC@ML=CjQFbgSmXIP;`zsD1e?!z?-gxr)&Q$DPyJ>Z9e-}hw!!* zdowvbo4bqT29oAi%87lbtUnQOY5z&(bngeGz5>zM%=x5}hqp5mtg0IewZ#la>2!qV zHplTkvxS+n%!N%EZz*EA>NCd&mEGM+vk7s>vz^zH8fTV`x!l3@2|G*XlYvRCeL)$s z^AVhEN|y85$!&IeAfYo{>jw!U#y^UvzjJ<;|8R;_EAK&-*@X~`I{}VShWR+*MWoH+&7fVDHD3wLy=3lWd#xHE1e5Hc7Fr zV|g&#W5WP&Re=vQ!2%wQD!qzQYWd=&`1gA(6HP(!iHt4|&$Dw&IS@*1SHNV86(2^v z>>2BSV;N46k~|V|;DO7RDHkKJ0?YXaK?|1k5MY*YdR&bgy^UxgpX8KC%PJ6IjI)7$ zA6FIAcADKEK!uO8{pC<>Y=2L}XF0;}|Tclwf{<0_TOG&N#HFVY+k9WNR3toddY3GHh2hy7~dx5rul=L93B|UVVPH`A2u*E zi3?yV&ZhbY-uDefGRAcf$y6Lm`H%Bq`gim3+nu(Ya9Y2o&cWJ%ZRRPw zUMss($YH}F4uW=y{-Cgv6dfA175;&({wmkbE}zpDigKq*+g?X+T^)PylV$$NMjpkq z>@Oy;F*gir=Ga>n&M36*BE6o{E&+so)f8pjl}R&6+Ct7C=pRs?NW{EvS=fI=rO<$e zF~XH{5=_nA(PcR~;7L0LEsr%oZtOYwZN^?nnSdN`D^QUz@8nxA0ASo9Wab#>rm^)^l z!~>9A?xm#^<68bP;92KROMeE~*b{csi>{JeEMKBwRXugCiFoAb-V5VDpIJXZSy$gX zJjD&qfT>5-SzG3p8d4j)Q33$-pNv+kEivE06|^+cxT8DDt*H{ickj-bPUTD^aKbGz z^+^@G=N0#8sV|$cFp{^tGk)8^=kU5j;Jk0!FZxCbpyCD3sCY3@Dg2%aHLP28$-rC{ 
zzdxi>uRMJ)Yl{GAjHWvCfsXWwlmLA?1-=>|H%E<8+b%;s?@g2a+p+3mpfD#g2C2%U zX3dxdUuisS$!0s8b~S*_8F2p}YevkF2X8&TFJu;ZQePfK<+E9}qvIWJ0MS}iJO4gSrpQe|Xm%EiZ65fJIqCCizoJ%;e-@*k za|Sa0Py*Bvnp!_>p*q~cIi{*wwK(3d+T_b!5N`kwQ@w{f$7#pk-wXV{xi*oe;YqvX zNjKfmown}f6ap|hKcpXd(9~fM8KoV4Gg)##ZX=7X z;{<1$rFz~UYL`l~OST_4Sz&E(yDbs80Vr>_>nC84=Ki5LrnLIK`Le&rE*ASWrhU4` z=6z0a-cE(FztknBxrv13)0FI=NqDN?9oEh+PGlQv=n19R9MBI`Ehg|CZ{{_8_F#2T zk~j_%%3`z@Z)(%=IwC4GY$vCawZa_FlP_S#9S-#n%=pP^ulA;KfHBC}rnoXXEV&-J z;k? z&rvBye$k(Mcs8_^(ACZMr3ME?DD*a*uiV>cmAz2Nf5{~UBBG+9?PV=$B(k5vms~2c zUOc-}2d`B>DOBUDysPJ#yD({%YB@J?xJ-S_H!)j9P9|yanhnRt)y-Xb zacJq)3KXoeSQo=nHmS*;nv$b`D$Fe9`dB`vMJA91oysUv1mIsAcv$ub=h=Js@1f{1 zAP?7)SdDSK41q?PbX+91YLce`*UgF70-v~imnog*^k-T~+tEHWJbW0TgXAI)daQCl?t$!@in z7>LQ375A~7=!%tCv$)OivjG6L>G3kP?mt5>WQ@Vdj<|S~g1a9SZga?zsH|38Dm|j{ zXJ7;rm>R`In+=FwwQqC{uq!SW+BK!#20-?^KI8~r3s0O|sWsRI^3o@~{cgB;+P(9` zXcwCOAgjpRk{2mc?1pa_rcRh({Pmc+MAWX@7rU}7fB1x~@xdi+NI6evqitd~>>DXJ zT;IPMwu66=RW!T7*1;tD0<%N>%MAY=BV!y36zzY%M7u8J*LrZ>TQ|rbP&hZz9UHmd z3=gF}O9;e#I_-1f6d%AE4$7To+u}k7de49fPdGwNPlZh_=fRJRUP(jW*t5-9%;V@1 zjvlByp!2z678xligZR>_N{UIkIF@8xqvpgX%)K9(;(-)+k2Z74By`*Q7Q1)lXEF`o zGWIPOSB^0AIbe1BM!HwqBQv!F&wtyp4E?PpcHM1vK{QrW(Qw|eJ;9XIM?>Wu>Yiam z9l|1z%yDJ$c)C7x4*5s=j3h|%f>m|9=-aftni6Un)ScU-&lQ!qG?A)?=i=KE5ka2v3&(iJX z-bu2m^LJ!g_nJ5~uBJh#bha3_DDIVvEVW9=oXWmR@}s*;2ots=9|emU#APfr~*O`d%BPzqSq;kVe@&3yY5^0@7PPg$!l z6=$kQi;mNz>N~j;9t&ie3(siDHqUtel)mFqWz~dom;K7<#gWNWnBWA{LazlygY_)+Q~`K$ z>NvGeHLk2*?T4c7OjG(QYL(LDr&>@%?N5o>AQf>~fZ&yW0Dy^<*&oLb})&59iHL7uy8uO^#-TIBw55)&m*qHs^44 zUnu-a_+&@%Y|9iimj_?ECSkju$@$%T1gtm(wC zWjmS6#sN`f#gK9n(69Y`YgU7S)^jpy*K$ep>hPOeup7p2G9p@R&+q7qeWb-&^*=F3 zEK~opEn4nvZz`Ehk6(P3NfXeIt~klosbRhA5@aO(+%**rRxPU^5k}?Mw z078F6IVr;mLD0M`mDk-9Hln+$2?GG#ktA|Ka0)c+0Dn!Rz_NTufXdsh3+^ z!!_~pbsgh;T~&4AJ0(3=FSW#zfzW-*zcR}IF9~`vyP01Ez4-?&fS(5>!T>)n*H*%V z`=^BSUxdaC;vfDmvG_mb#%FB8Uox)Sxh$#yKP87k^8ZcQ@88?$fqjkXSeW$m(F&EC zZ0@Brz%AJ1?b6Eb307ybV3EAYOrsXbS7%CfAYU(O4SfV<#}q-AM%|OGSFK&8=TZzu 
zrVn5LRhN=#yyn@z1D0PJO--fEp4fD6eOl#)Nh8CY2}6Dc%)wwOoYZFh0}mS@viXcx z@fHF|@j;JT;oZ=LO|PnwI{QQPGO!KZjG;65TnRa(+bpzK;0O zN0oYq*;?{8eYFTzs36~7*=)*P+~7~p&uLxnbQ3be0$8nb<{gT|H#}Ce3yh!qbI?FTUap`986vlH) z($N&N)-&dTSzJU;)f}FrE_rQiBsjXjQpF`t-Exv8rQse~uhhFw`_1D;UU&AbEv`h| z>KJ4@(a;5-suZTQmW7jijl1O*vyv1cm9kVf+Ibr{Z5OZLFLadV=9u~2ppth26x|!O zzJ9mRS@(H{!Q-wBB`2CY%(xBpPV9|Fkf zD&pw*6-t-o%s#V`(vZvSJlJgwZOE%l+iFe%=*I=jZq3@qb>}`q!h^I`*i-vwD8JR) zPgsBfAeyd`lfKNJt@f`hpad{CoJ$^zm59#qr6cmLM2=c2N0`875yTs zoIsf)McC1>mw1}%;{KQf+IAEjo}f-SBEiei&v(m2^$zK1b0~Kr92azI=xWu7bB7N{ z4|=H7Zy$EK7-?x@A{TwgiIv49L|@h;*=zzDvYylND>TwDh;Y|zcA>c@x=HC8n!0WG zv}GJ;6AT&*#$av~8s9OYBh!&ysC4#dPZn3)(!Q+DQrhzl3oo8}McwD?j|R?o*j8!k z>6iB&hp#`4H1Is1@ieLnszk@n@KU%ZfVc9w3%q9uw+M)lR6!Jk@<(ck6qmQoR6pq-7fJ16M1ms*0jv zD7KZwdLlVYghL~{h5Xh=w3eK!Vb&5r z4u|_!{+>gh9rEp1_MX6mQ;*2LZNl`e!}rW@I#L~6PIMMUYlht!=MUT2e7rdSi;Y71 zVh>P~m;?yY72$c;$h`3qsW6hSa*pdGDqLdrn^h@K$3oEx<*mn_i`(DFWQbW)MSFgG z`I#~vP#>!4A}T+vzJ9$e11uphP+}_Rd}Ue)9YgV}5lu^;yCI zXc`}_xOfQqCNTNA|8YNctFkJus`yS!rKdFT8eL{|%t`jJmnm6SOnXQuV!GC`HT_jK zT;GSmrXJe9OwX2hw7BvJn`V#98Zp0Ybp1b!0&r)B%&W>xSIFp`>9O6c;NhjBo(gPY zxcuDBMPmOJj@Z+%VD?n~dl#Vq!%aHsXzTf*sAC@Aw!10XZ9V~rO71ehUaLXN5HF>q z)-ExrYvyLYzzkSCIDXdC{%ByuVR|iX-ePcs(fR=T6*Dw4=h1p`Ncq+ez2@XY?6X(? 
z^P~0D`)%ykt8e8pbyG9&`w>I7qv6$Gx_qI&ZhfER+~7=Pf~}Wmr9)&rcgXjl6;rZY zY1+``IW{iorN>11+CLU@9;1#ZD8GwjcZ=Z^1__r4ydq}tRysj4^83lAbqEU?z;aT- z!8PPo42Oaa&C8vDJP4;sbGk1G8k$SoKc7d{1rAu@Hc$bqNpm|BM9>}GfvMCYdq&NgSNfX}0Cn>&CBbW#L^o+0TufDeWymjp_K5+d0r1dG2Yd zy*e|WkN8g!+V%@ycR)W@p#9=2_7fqQZ|>0}vj^|AFO$(V+Zle+GM*uex4pDhJ3{b8 z+(YddoJ+b`DIE);x<#m7kP)^>Vv zydtkMa4zX9(RZlcN|CK>!n2x_qgt!NWcS~9C*sU*9Xj2{g{s_ldh$X%mQw^2L~K?( z><@4Bg${?FqY$ZdbK>p5C0blniUje{sU)J_a(xhG%RNjRnf5Y{-*Bwhuw{$* z%brD=NeCHThVHUJ)(t;`7BPfa)lPlFM=$>2oHQRTlN)60*NM0eUsY4!Q&H*l(}6@xnJJ}dr_WAKD%IfMhjqbD zGDEJAvXMNgn?KffKj_E$0v9to%wyfmF0j?p@k`utWtREo7YItWKcT%VB=rS@qXRdl zkJ0Ev6b*XR34lC_GSy^;aY`E?7<8bsX5A}ehYWvMcEeDs4HgS&DI z9y1iKXV-yQ(;##*+>!@Db->D|xc0mqBSIHNTG?vYledY!B8xJ%T&Wvj?H$%HWz(^Q zd@M!X*@%Noy+K*cQnr+^*L$dy*z){`+cO~wWB&v#j)!lO&wlE!`%a^?K!U{_!PMON znB{_nX|C~=PIBP4AJ)i*ZhN7^Eo>NV>rG_j1zwxYZ_(}FhK5G8MNbTFs;NU-yr7}i6a zj>+ncQ?_6t4uu|Y#^&>gtXM5x10kBFsqnc(_1~4Z*#Wzd+vBT>fzpJ*H!U0a)hCx~ z(@sVsaIQ){kZ%7_jJqI@N+^<{VeJ?0JXU z2K8@|jkpX*0v}2uqZbsb+4iGyq}p8)HDf<)jUvdS!9Vv@8@euD9t#sc?zH;&F%B-{ zc^KSPevC)TFqZBUeXlQjVU?j8*nWyIm$Wzo98YZ@AHADY<_O6EG{i_jp`|24uAPU z?dkA9jgtD@9_kSMBLjF0p#|1AW|=ms zG^aJWvE?0gmEGXmgz7xMqIl_X({4zqvdlqw#2Y;R&=XG~q{B|Tqa5BpX@2z5M zpkI_a8hSnb{D(XF`SyE!b%cxEWt6c>Uwd!1@%zA=|H({3!^Iuatjb=gcbdw$ddX+T z8c0aGHZX5FW#1OO216HGsBNvE-SZC@=#q1Kw34t^+QFKfRD_xG(ThTU{N)rme$E64 zS|9$L2#`Dk4!WxH{){?Fa9x8y&>NIFGu9yutvoMRgS#nLw#&1+aUV{6Z}7! zp83(}v(hP-@jGMTyHk!eRzu;c``Z;_WQL7%xuNE}4@GYsmizzmk7cd?s{EbcNhj5z zdmA=h`W(L;)AC~SfCKZ)ra}7U46v1pm=)ndlpKp-Ttw8C-L-E$c*QOiT zZ*>e`jYu*()ePm9k{YXs4KaR1PA_cUcbGk&_!RO=m3NEuqod;+{kYNvX|grmOfJb;?JPggd_e6D(7PrmaGuk;=3l_ddt`i*6h@T>)0Ui#i0pkD>2s_$7- zSKtz$uc`U()Su28iU1D){3P+}MUJyQ3z|y;W|jSj;H)7%SH}NimqmTY5&=({aq+q5 zkG4twJ8d=o+N-0V>qO;-%!}a}^5ZX>%BxJi4JBaPud0jM zG%yy(*zRCg=d)+&@>1p?7nd}x8ElgS^9`N#Yk?lDiD6Lnz@g^`?G@HaiPIbEBo_}! 
z@0RniS|#bk3T%`Tb^J!zt>rbm%7B`iC{*8iwchV4BCt|MA}^WFH4GiEajpAo-$A;f zL<4jMo3YQU$Ty8O&jU5M^>XJ;C&6m<%MbE~Le-yAALhUQs8yg-3p7j7Iu=LL-RD2b z7&@f~+lXpSS?_@J*n4UP-fl&uY3ecCa?d13go?r*@_U~Ki(bC*C>raWCmSzZo-XZw zym${Bzv^jFdeFB@miIB#Yf+T9zBg1E$Z@@fo9!Poa;l$xbBC>(=GijPItJf9Nlqk? zFg|u>+r`y+(^$1gxwHkVH{8{?HJ!w=Nk+e?$PjPzow{nfMr^7z#bBKY-eAt#3dJ8= zG%jCjtz5fmZP2?q(*EkG_Z=zF6y1r8Xf4v!Sk{-oLwLMAeu%HYh69PuA($AcgtcX6 ztcTSfkFL5Qj+eFAG{-A3Wje`y4_`OFqj>fdOcciXHo8Ehq#CjM0+r4ETno`@7PHKo z_wa~-fS7^y97do}<8JFn#%f55<5@hjL?WCKPR_%1h=eit4Z3qd{_zoY`y>qo&ot12$b8CA>8n8es zGhUuvi8(DS36F{KAR@%@0IejQA;x5U(uE+Vz7>^Xl47udntdJcM`NnY*|f zHNr{`g6uONXwYfk1=#!~@Ghu1EdJo$xX z)gzS>31U}EzS`SE;@wTKEJlRo`S#dexKY#Vk8L{F>y2Ri7R7Z#=_%jE4-?){wc^eh zonOLUL*eigDxDs0L5w4K$EL%*10zxtnxu@B4jUzQYK@8|S-bYx+D6xd@Xz%OiM<@A zdfjhK1Cl2taNkN2xq2aQT2t>86XAtGFoZ;jjwG9c!c~mMG-1{EJ{zmr7A@+oMi3cY zn!LnE$!j7As`xiEIow7uC#I8bOr|?}sBG`ev2+4sGDK4QtC?i-Ya;va?r^^I;x~_{ z+e>^>G^W;0|3I_9(jtY9Wq6eGal$I}VI-`o)}VG*sl>h971Fezjh$*nci-CqHQGQ| zIr?RlatPQcP9y9gAK3DF`EBm@u*gdgoJgaTSVqDud#a{he0Jnj(LSDB)y(S-s|gnb zJqR+*VJ8ZUe~{d$bbZxB>7ujD&W#-wHjU{`VmeXdIUh(;fgE?n$?4k_T}x_4{+>=T z)ap24h&7aejrTcN=vr!ZqVgkHdgfz;>B?%n2x@iJmY&u)w?*K-Ck;{94oGQ%Jte&M zT8Oa0Qrug)nkwL_LyPDU*90W)C(YbO&7Fq=NnF|9>&OlGZC%VRC#pU)F#JB!xa(*q zE8hs%YjjPea;sn=VOpCE@}0%y48T445@h;?$WxrNZEOzYU9;NZ%X{T#9rm@4!;FzA zkfp5HjGXO$)_(BZF?aN0aCQPLwY1OIo+X}+6k;ZqrvcGXBE2BzPA+}nTsidmIAifp zfw!gZ2W;P#qRfRtZ3bDSYE`PBgjW!r7QoltMRybSN13jV`9R7$%!c2xTMa z`k%COUm))=#swJ|Lk{y_LD1Z63j36|jf5;1X))a(YIsuN25@X=$<(Zi`K9UZ!p72` z@({M0D}vmz&z&Rgd-6!`CEZHcuA7)`p=~7@u%>LWW*0DTUC|TVXD5JwfC%XOy#sr1 zv`*zpNR!B!oo;yj4sizihs!Y*7vpb{(n-IZmDwJKvi1lT2HNU=`c1t-h}Tulyz%kU zPLzD&5<}it`^6-mB&V-_#E6zbDTOF8@cC@whu_VLn5;q#XG_TMPto(cv%0Z*gTrSN z`yP&^5Ev(#6`;GJeF#c^ndlj6?_pt7Jzo>qr9`QF*ypIO-HRyqcW?3W@DEUhvo$;d zee-@ju(xRT>KxHp@yxMV7#fsp^cDR~Vv~+ClPdBc8dkuc0=v)37GyYBhdM9d%xLUu zk406)OF*siJ8*|NaeYPfX=P^mT@}jH6@R!p#k^vF!fd>!iTVBeTTHOaWSy)Mb zZ|az1FfrSMFhDNDUqFh*3*`I;8j;iEOMsC69*AyIq5*LAteek(hZ4B%I{Df3W-BAV 
zd%skCY6S$H5PA<*dHI8Cv>2Hip+I*G{E2K&9YLOx0XKQAGaM9G{}HbPG7Vn3>v(b?R*vG$2}nd1nP(MnRACF%8)*Ct{2He3~O z%iOh+k24Vwdu>j;g8cri+_HWzO&j;5Y#W1|od(*bnVsVu2t}HlfRoB>$|Xb7W|`Mb zI(nk+BMf_ONGj0QDn`$Lqt(~QhIDkL9r9AZVUFG?8Tv_8<#v~Oj6u>u`dsVFCT=yG z#$`^n2G#6w-vW)=JI~)w+D__%{Vrvj8aX>4CCl@2^$8UyTJKw-!GaqSQWg?9)}V8_ zlUlSc+4clsRoiwox%%YiijtIp!kBkz+P=9FN=qmyEy6~Ld5qQYLw@enHX%xcQ834`b^ND!=;lUBi;VD~|ToXHuNYCB@w z5dLFEUvIFv5)&O#7A>1^Wd|Lk1{&Bgu{q8|I4?-t3l6-|Dhw=QDES48#vn#47|jlO4b*qjYxqZbo$$Si!v--z>^VOa5Qd|2qSq;%Sijk zJQ-o$@sitCw=G1@_;aVm9#vT0{4orRT_SB;v^yU1U3zrkbDwV$Px~`^Y+BnOxw|y0unkmbIw|#jI6JxQ!Nsr)!w;GxMBDBT zscIp3MMhfC(YI$Zp`5{`q4*@m3fXoF{*OxVZ6d2{s6xIGYN~+BVO;~+vd=k0VKYbM zh<#VDdW;fEf$XiIku?Jm1AleC{HG(|qve9+m$e<|1hW)>7{wk408H{{k+ld^jw&## zFkM%E&e~u;Iq0?OGFtx_&t^f#uk=*$Mi@QOMXH?;dwU^a(0#KxL^gsO#1izRp4V(X zXs^-%%hK{l>G}zP>c&ebD`73cl3_x;t;5g4Lc7DZ?P0B<-J!?6k%BlAmkVn)lio<* zeLUTr#-1rli3VExBu$@}kx0Wy!kStFoykh{Q}(QedQU|eNB_Axb`j9Gdt#zW8G&?) z-XpXa6CkQ0^s5p8-GBf1eMl})DelJ#jRc|<+%)Wf&$Cb@5m195`$BvNj{L~u5 z|K^=I+lM~HlXHnLK2zPgL_x>eL%I|?i_P@0E45F9JF|V!HxB0Y|m+ z)99jX1ZFO$nqre*CyWKBXCMf(q9pYaUwf^4qNH|fX4>d1fa!%#((#YBSJFLLqJjoA zC=#t$(vR57=etcg$$W(S6NI7|vE~6QOt(YH=uA-psMXOCB{TNqG_U*Y^g=Y9TuOmJ zyc;VD3VWoQHmz4nY-$L?eVI5G$4;3O7V{BN(m)UD?Xi%(w(WWgzz{Xjw=bwD-5_!} zTE3p*8o-&688zyVp6h-2<`TMy?{E_zbbX47k-zbJNkAB-v=MuzeFekF_utO!R&3xo zKk*DojN)^nW%W*rqq;DRd%wSoE?e#e_5k#fPbp(S%aDw?Gp8^S=GrtV*#gAcy zf<5_^sM*%mx$(+oVm=*(RU#};vDeriM;Zo!UrI~ F{{ta%lm7q! 
diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-3.png b/docs/source/assets/design/v1/prefix_caching/example-time-3.png index 71b9e9b60ab9aae080a6e70a9594dd3374c3591b..d753a406bdb9aee1940e9e84e8b956c34f0b6b21 100644 GIT binary patch literal 51241 zcmce-cl6`b)iz3JC<8+Y5FjuU>VyE2EXy)=EUUU(k|j&PfQy=KS+XR{lIUa@AhcnC z&OOlv``KHk zG-}M;jkn);t+m#g%dsTC)>`YVSZl4dx1O^BDEYF#YXN+%?eh$=){T$vxO%O%a;1Q3 z2X>@eBW|$?gqh_C|0K z&h3^dWMS|D;rk#+Uj0}7j-ieJ-B5NP2sE%~wbQf4f#aLIWe9N3jRS22zCkhgtP9|y z4gO*1a5#H-ZXviVxb8@kwQku8c0=fFCbv%p1I2rmS+QE#4Jm@_ku}u77o+utPWnjS z>&*7C98ew3?*r|V0YydAb8JwFh9LwLWg#fL50?J$-}(Kwve3~xp7pKX(y41f$N1N@ zX+JE`Ew!l0WLKY|9jq_`qe$z7n&(?!t-e{g|MZ>y5X7$buhPC2T0Jn3wA2O5zn;&jLAQ@_ z7+nR+Ne|?(rfG&h6()`L*vWfZt*YfaDwNfxMVe7Kk1{i%=Za|2XqHP>F;c-A6L3W_ zIT6>Bz|2L2C)gT3t@4Hv;AS!Bz*89wI>Sk(ZJM~pOrT%J<-z>4HD1Ec+H z*3_APLb`CJZjFm(nQGTdRcNS)8jnIk&ta-mH|bX>vlr9@(Jyd4N6k?=3(ren=qxu@ zhuz+!7U)p9ZdG&L(O4L0gECVUXt|Zg8Q65jgMyn)Hh9J%DWw?}&NNfhE!#>x>)4 zkS*Gs;tawfW}|#YvAyCvSqct(;&E#_3ZupQf%NA;4vro`) z!fII8A`Bfol7x2oY5nC-#9l?n9EY~q- z6|@-nnVCMz4?%IZJtd@Eeai5PPE$v@-T)Q*P~gBGAGT^jn;T?1K^`4eg|S7H zXBXh1`>q|l=>(=iaS zI&^)grH;Ib-#40LJa4#_;gBd6nl;F;akCOPv?>%ig(IVs%Qo43(9wO(CSyYn`B^>g zRtf`Rg0%bA5Z9cdWlda(9a6XwS_9uz&A!~o&a81Z3kJ~YM~wpogD3gDTAwMIHgPd zEaf8-68mT-JxR7z7Q0peq?Asp6j6d5GBcMcgMx0eIE6=Oc|7VA&7wS@YDUy=mLgSi zY1QN`n4-Wbp|PBaLYSsaxrvjGKAzTOYsy)2uIZ}q2r^V}QfVr1x37D?7`3u_m_moy zOw*c5ZrvqyVb=90Y{wU~${?H1)U81;+oalp=P(5@ciMq6PEbQ6x{1`oY$fa1gBBVW zAx3U{238-0c}1I69EpZ&<2oxWq*&<7pSv2S6Ix@l8UPt6d zb|p`79cOGdLxwXE8o{+*%gQiVs;Fp#l=_(jX=Va)iqqN74EIDm(zo5ws*O)qL~I0gm1Y~av(VUkxolWw+b#Y2r-r@yWRT7>l{}e2N*&D`;}fPu_dx-uPPFPet^?Z-ZvSKc|1T`?UrP^;e~KOc z0}hzN-dvKCYKJqP1I0>y98IYIV1elW#scl4+^$J%C5QCe6CD^OyElS|yffgeUaREz zYCavj(WEJB3N;5AB{Z8zwU?@HFf_h~^vB(9tDpefMbBa~tIaxgpP0tQjM$9_q-jN! 
zN@qmsGg+2oAP#MHXfu_H%0YqBCN?NEnz} z$@gQJZHqXG*y((ysEjwJPBXQWi4tcmG+ivF)FDE{J_EkD93sxH-Z28LIjM=u3iO#&5TYHdgEpXJ(Wk0;))Pu_g9RY2304 zgC-3$XeG&_h*jcb*cRF?y5<*vpwKjw(W?wi!iHCHXDv8q@~Jv-kr|^_J1Uzh1=5M5 z@*q+UmfQlY0&`+?O{1r0CL>zUHOq)hbY#7Nkz=C6$iz?( z>LO{?Cw03njzhCURkS*5P)Md?!}SJXw8P#E@$->buKIyzRofkprE#H=u!7%g%7#+! z`Fw~^O_P~H>T3q}t~jg~aDMs~R&D zT(JstzQU<0V`nu+AjSdYia_OMy1=ZpCtRFPwquMO+hR*51qYDW8AoNeX_Y6sZ3vKP zmBtLm&0x~BXTAK?QV7+-np^`s=dr^^TQk^!U!E3XqNI|>P*1nV&jbwH96?%>!7{ov zY<8t)i?*DiTCH?M0j=9Pltc=}BsVRyuHPkeBUanR2^84eEV1f?vTly+9i|{@19O%o zW`crudx;tQ(X44S#&!`H>ykjV4O{@*R4@#o3ziJ*txzf?&`jw^L50n<3q7Ch6aD;@ zhm)=}@&hf(1%5q|nvw(eE6x5O$LEYu0EV0=!5wJeoBKRYHlsl;KbTsmh79Ar z7W-PefZBp8V*SVipciTSNYUZUPNC5Tze7R3=1i*k3>#42SYD5Ile_}7H%$zIq;|=X z?t7&{rLFO10d%S=1ppvY9Z5=LFa(=ghZCF{h(%H$IMZTK1>tI{sEzwn#AW)NI?2~Y zbT&IgOtnzYVAZh-o+d=9KNM^(mb*9}Ov(+e3d59>dV|s+glei|M0!OPMp>Ab$vROO zmE{Zs`K+X;HmW?FwzziO_8rH~QUvN5^;*7_@VZcIbX$E^)vGO6uh$2dE@b#dqf3^j zJvWQzrjVj_CYbFj#sqj0;83HvX3oc^vE41FYYNx zlu3P*nkm8`yGVOfbA)NoM5|MIDp#CdNINtx8JS#^DlJKBcjyc`L$FSPnwk@|WpuMs zCeeeMmbVztF=C-mvq7kQq_-QDem5Q=uvZ2{B4B*XCeyB7O_hgE!bD~a^aU+c+EPxn z+hT{#p+jhlz;P?obZM9^XZt=M3*}r-5=NB<)+IeWG_!i&Xo2Qhup}jp4)h*ktM7Lef zrZSvU+C)!4w6KR$dR9(-1v;8Bt}a1A*d>ywA_g4IiQ*Q91>Q_bgmQ{0v%XKya_vISwxVbx_Yt64nv%>-INo6sx!)YBc&3ljYz%3% z0JLsK0lFiQWyf&R4b5nj22>hL+f_KgrA#x!2NeiT!cGRpsv5`0Tpo#Qy*zLx`%*11 zB$x(}VLD3@Xi#o7u}X_+8tuAaaU-II=|oE4+1YVd4L!Pp#j!OYcPEb;)g_#*a7`{5Y35YzsFy)`$#0U(!d;XzX{JgCMWRR9CRU z#0arI46vG)7zWdd^{9jsR1O*ORkzfPMrg6lqA@+JIs`PSgS4lDkd&cWV-&aQ**TjALJ~0EIL^%%T7YASC5nWq?KBfRx3LYd2#x^Jd#R6TVP=m|QBfe6#J0*^C#2{MBz@abtx zY_Z?KZ722cm4=lKx^7mKvzEVZ|RJOx0omKLr*~bDV(?s-|1Kh|V+2F@sf->Df9fiA5 zH3Bs|IX9J-_BSrFhnWLCpY_L?&d_tGWoi^paxF2)0BSQePqo07WWO`@{6+yd;Bcml zFeR)JwooN*SPI}aykW5>b*!GM#La?|`c9UVDk4JcwQEVi8w+EK82eN^T_6e-F=f;n zKs*)*B>|geTazh-@cAiF=&mhy@iocPd4SQ2OaHg4-Gj+SM?uG-Q z=)wp-rpmHZF)`O4gVqUG3dTO|xVT^S>}iRfjC%02-D22(E-ZlNjL1Y=_UP65N?eEdqzDgc?Y|`;C}kFS2?u zS@TI5Op;cDC|{MsUM=Vphi=6gXGG70dWM&2X$5u2W+z^|hHyg|)YW6I8WzV?LN)oY 
zCJg`wG`5jqJEPiO-j$?S%yJFGv*Aj?8z`JqOAm$Jk^r^JQ=*b77!n^0e8xtZWYD0} z#VKYpOf2*5(o_xgq=*e?sa2)HJkU{k#!H=X2ZN-?Hck4|9NM0l zL=GuAFrqbzIYLkywMn4V%B5t|DWhNsIFp32PR?f#r(LiZ8(24bhDWVX1zSKvl_}`V z)lsU@mL%P%+PXMP7X=~uY>+x~ft^_JPz94GvNjqO%1DnI_NGV-ep4Wi%=1pJJgC+~ zE}x8jB$=RD&9vYNQ}J4Ou4`lhWst9>E&0gMF1ZX-MYU8d%!Ue3l>|kXkv3qid!AF_ z%WbJ$iu~C~;oMBuXB&052G)oePHJKf@s_9PcdScBWxH70BL)-_;LCl7!3CdaO zqyu^-#{4*>1*9tLG?GDb{RYcUctdxGa4Rq2NiTrF&}>-X6WhwSl~^g`*>ve=*+wXj zD_A9h)Nymx##EM6wdx?yY&7eI`8Ho~_bVFfC$ig@!2|xR7PI=WZ%9Lz@ZplDN@^zsH`B6g$$VDwotkHbmLHb`G@O;Y)rlIR@JtwG@DY|;c!&xe z8Bqn2s0Usq$FQuO2gNB&8PGLAvk*4sMZGaHi12|plG=XM9Oku?n8(tPDR(k`(&k3BdltO)&?av zmG}~n6v3jPqD^??QoGnRbr7JV2M$&zhYqGx2EEJ}M2SMG;ZI0$*yKIn&kqKc01Vt< z0G#BQ8jV^G1)!ze@Af+NydtN&Fq;aZ-6uq844$yud!EjvD^B9`$ED8I30QX0kA(snSyjX)IEAFFGYAYscL-6nVFwQ@Os2wlJ+(`y zz;Ew)vslbzm_j2Gm4+ulmEoi(2o0Shg>E!TTzOJ4`con7xxs7*cqGQLTGf!yd!-6C zi7d1^n9=;e$1@s=3r#udx|Nyh^L9kx`AkF06R1*Gq$WMB`qjV%$~&JiXJ#*HYQu>( ztFZkNAiV23*(KXlX2{gbl#{nS2rCOpR8{y{CksFy5#U`T-I$8laU>fz4J^|K1a8@v z$XV0^t0W2z=?K6vv4{3BmKv1A=?NiW6G6rr!<UU^mK1b2!K=lb|Y> z8)T~X39Jg)gI;fJL_nWP{Q(1lXb}yW$ShMNV2WX73HY7F$YEo!ecv2ds2xKc$H{jF zGbhJ1)dD)4h$XIWG~Ce;>VU8wCzS+pzz!ywn5y4LzLJEg2o;J6chzHC#EU z_bDxfQDc=gG`6Pn2#Br%Ft2xWngG22DdW~{AsxREl2NY@RBaMwtiG+6LTJ>TjJ<|X zlqV(tzqIPJIjcegiSoS11+VE%o4t}_ zo3&TMBc0@Y__AxR(3Y*=k-cP z54C`sVcIm#DnSEml+&M7rw+@-QXSPxVmiZ0kuIP~5zzj0rZZ~jQ+8OLG)=!EI_{LV zxR#Qs2MJEVT|KW=8ziUt)M(1#=9H*8eXwzqsr2dve_~d$Ia41p{eI3frW3=T$<;>P z>T$Xw=UFPkXuVyF01Zkia@{O;0L~5EI!{QbOOR+CYM}|9<1^WawS`!# ztrTOMauYE-M%WfQqb8({`V9%=2T0ZqP^gBmo)FI9?zn>o2*vXqDe!&ZT#Pj{j{VT@ zfOzeIvFMzG3v?^!r??tSJ?yq@GN}av8qG*(D7A@fJvS~*_#mE5vMs$o#Zz-c$3sw` z*J6Nn#yMcjVWtmfgM1ktREMHrQ#4}3@gOE;4CGTds^-Z3VbXySs+D3ACBd>*j*$ET zu(J}}a|>vDsEzYtv7e#3DGv@443kUGbFRa55{ohYcA>-Yfa`8H_oi4NfNm5b446{2`K^D@?DA1++L_6t=Lwv z(V)=^|tkkaIFxTN? 
z7&6*!%H;vBX5ug;z^Ao5&1qJ?tU|6aQN~gg*8nC+r55Yci4O1~Z8o#1F>T`ido~B` z;J5M2#9*`eT9vCQyn(6|rHGdBZ#*M`XcNFQR0D+@ww~cc)Syaz7UT?dUB>Qqy)tkp zds@M()Idx`opIxS8_q{QL<`1*?J3i^J);dd8>V1&Aa@w6r3>9kJBtycMhql^R+>J_ z6>F+pDj|&m0djDJG%()ueI&3f0SLN0wxJPlvB37FcHeKOLPV%?O+!pQ>CFnnpp+}> zILJbo#%OG0lNwGY_|U|W2+Dz~ll0Uwg#p9Vzdl9-Rg<38#i2?g7OJgbn?uI7!Y34ug*B2Knp$x<&E)V>ksC~4EOfKeg!e-%-%G8X zHm=H|o`#)3tX``cHOz%N5uN(W431(m&DZk{96)cPNkyUw`;E{WqN#fLH?Ez~otvp<&$^JSgcWLaIaKvA(lsj z1RN{Zl}mw91_y^oD%?!!7EFZn+Lb~$7|A|ZRuJj~k)?9f1PG@Acn4kryn{F)gSO?% zje1Wg2a1c1bi$5i0Rw`S6d?UM@JvZ#LPVUY(*|%V0FP2gDY-EM`X{QUZ8=na2EwZ* z*)NvjI?4`FIPnU)n`B2b5En#BB{vs0VwW5CAfq@?SxOB??QXSzmN`I)7W^z~wOdI$ zEpJwvxCNJNnl4X7s@hKUT3ExgD%`1SR&yi(U%y9;436q4bh&Frvw>d%gGr{*tl??R za)8l7)r&YVk@f)WYJ`LhzG{z~zFz*#!EF+EP~LTrdep3eKwq;n2zw38fE|m2O?%~ ztB5wz2N7AsO9(t^;Uy!Eg|OJFc`d@s;#fiQC(~ZO)iN_BM43cn-YpdXb5jw?h`}Vo z04Yf`qb!k-+sJj9Nro?poNqdTrVcu_X&V9NzucYHcG2l;`1X9So9dQ8$l5tW-2Seupr$%B9c zOq(%p*)F)iM;ulcaR^4TjS(W1+f_EJH>^Zcr=+Wm>tiTep+G>$vcbtG*(l)pAlIyi z;f&7Zckeuv~@|AR>&YxirQvEzcP~RS=rDb*6}0E=Se5u;5%_i%y>lLXU92B#`Cy62BF%X zi(36u8+KqgU(rAU$wc7VO{uRr6H}_fq@brOT#|XNI)%FuBS{5R>eqW?+wJv4Q)IAy z6DxtEQwFfSUp3v<3=3GmUgVLUjP+**sUr za;}t2f*cLF7J?VG(U76#S_>Jpn^e;3J1REmCsd6t<(m`S8cQU>Lz!5cjQjks&~;Qn zl2EwD=Yj6&xL$+zLc<{|<$RSYDQKpb=}+^$8mm_HlHapyW68-eL32jz!nlG1K7I?GRM(nuw}6yW`adhaj;>_2iVkZJ&#PkRu3YOB9 zlY;3mf?b&vQY2#s3MWdDKWq@D!Ol>^&Nxjo-)+9RM~Tu)l38~+ktaiKYME>Z^Bo1y20gPs z6DoFTB%qyHVO(g8Dj+>jgFC|tM1ouhoyWtFa)b_pw0bd6vq5Rx275Lhl0`G$luR2f z^KoWKC`d!AFtk`E{Un<$1UiVY8-C9udNrSio0!s4d6wo~swF8|3(4^eT5be6*~~R< zHy-wz&|ox4kSWEL)RxfUZ8G)O`>mncCX^!4rH!7{#%%fl1m( zD3xN0_~o8zO;vi*qp5M?Gz4BM499gc0uoNdUGVsjK%tg79Dw|RxWR+aD=6c`xN2Kf zs!<6_)e%Qo0IwQyb)*S0DG$1?QG&TtGqJ#96?vjsqH1Z33@wdI+AxRogd&WYkPeby zsj20PLuCM>kQvgJ(G(4rv)MqgLLpy+B(7i83qCK=)l#a2nEwc16h?#eM&!5&D z6z`fHbgGZ}EYWUw)|hSjr5Vx7(bG~w09O+q0FiGG+?0;AA&AZms?`8Qrt>6t$%E_C z(kL1UlgOuJ)h81yN2U&3x9->?$RLm8Y@@A&6S);rAbD6c5zq{w*%=V~1=$0Mf)(u| zNM(a^j3;`z29ZE*piv2c-2w!FHzJ<^T?MzZvL4ugfluQ(M4oMt*_tj|Dmtm!Aj!op 
z_I;S;*z8CFVYdR4XKSOPAf~7aAn#m7NUd*r)6|`$Fy^UZ$Yu+TM!8q*^m$aa6TV9I z3sAfLO<)34W+J@@3>D!>vYpN+r#fXkRh3dOOBVw6EmU!cSOp7A_xv1Jn93iRMNF23$N$4kO)ld&Jaaq>!K`9(@ zakfhLOqI?x5+cgPkb~d@sRZ(Ts-bIS2ND(6qJmv<_^Co97GaHWV2%isnM}Iv3QpAm zf+xdPrEgIAL6-#*ZGm?-$}JFN67#_S534Sc0o)w{91<@tf{R%?1_r5jS&(Rl6M!9{ zwITu1MrRsTY|Du|1tlO?s2V0DQPPBvHJfa}MH*Mmi-5i$^q|Pffy!aCMjJuIkxWW> zzo?eNFq8E_YRuFg(a~5k0GcPd7*vrfSr+*Q&zg9MR2zf1QOS9En+$y!={9D(8`wxq zP6r}PiajlpDq$RIYN4!83h7DVoJizRWF|6B)ohWusW$AEYcQzRh>Ho92Mj296@p?= zQqN8ZrfU%HIHOk7aX(DOXea{Q2D{yCo{j}X)Wu$kRJ%40CIm)j%a+O|eH-i7yt1OQ zsW~!pTCT4u!x?Xs8uq9}f!vF6qY9$&_GmJ-i%~gF*fMa>1k-3IqtyEV33A}gn-I~# z>obkI6QFSqi+jE)`$0?gbU2^L4hx4^Bn)lDwc)%RS3Gzi(0UV$rrPh283ZJskHFhF z*@4dn*-|AdV3Tn@!e9d|RUg2QmdRV0R=Xv7voZ@_Hu0cgok`!CLTUv#-M3((frHJI z{qdx2W&y;Hq!hh$X8j^9cI13J$7z{76yz|9Oa~o!lZ+Tl`_l?C2IsE__<|6Zav(D% zG$9-YZbfdh#w&6`QJaK@!jpw%ZPP<@ck{%T5!&jyK>v_$!&G*Ip`Db zuP{G3{NC3Nd*j|GZu>ZY&K9@Z6`nP%pWs7QCK*WW@CpUQi?Ko{}mlfEC z-yRIKp7-P9wpexX1NYWmJA`|lTqXYuJ=OT?+z;d}AA5;j_1W@!s;_?U$Y=gDcF@G} zKi_5Esmm_i#Ttl@y~HN`DsA~Ok0#e|dD9W=a^KtS=pS0iY0paR^ZUQKX5aFgyIaSv z*nauzSVEyQ4Pl-qV@!LcA!vZh_>@1X8HC)^J(Vmwqv~EA{ z&%xt|erO$Q`P{vwssQ)1BXJ${9cVWbWcS ziB*d@rj%sg-2dF&?%vD#`HD?`^v2)Ued0WE@kaOj4tIXAdd6*De)Y-DZ@=C1U5hT; zm3aOT{->9nd+Tloe(&;g_a|4`%g;D#B|hKZcY`OeWzQVG>ZRl98!umR!SUhdub!;0 z@wm6RTLJG)-~&Rr^h{mc8$yX*R0Z`kb3!d8j;-d6`M-|6fRmaTYb?S&@>i?=%Q zx@VVtcFaG&KL0nfWlQw;pIlUYbEDP!tiAfGd#_)od(6*P9DKr>+s=E2TlLV|8(mdf z`MYJ^I(sC$Uu)HfyS+@5PYM4CadmF{`IaK?Z$10O@_Wu%@6#2F#tR;q_ad|FQ=84P z4&k1lUViA7Up{~0ZQC4h+p-U?S#is5r_XuuC3(v~x8ZL|>HIh5oI|oFvD@!>{_+Js zI&7VrH{d?ob^d8-S%I6{EPc`{pzqImLEj%Cmy%*ic=2! 
z{`{ZLdv5b}xBBG%K04R`KY?_J^l$R-r4Qw z`5Wbqzx_wIz53a459Mz@f-zWAD6s|0w9G zF|ii>@!!sTy zIsfe|KPN7755)H)ia-A7pC=v%UAchT$WrJEI8!+-#vxA zxZ{=Eni=@c9)A$_?v;<$Y`El!vllJ?M}PBe*Z;%o<`3dMkUReR`LfS$ zdupdki#t>fT=v43Kic`zxmUcKz3r<@78iDSGJN%K8!cWB``)&n{NvT_|H1D_ym{+= zPrdW%U(Pyc#TQ4OIsdYSKlurE)9zPgANubh+IYj`$)nkQPk-{{uKLinTYSx(yw$E- z^Gm<}+ppJFe!J`3)yI>>o6%-FvvV#ubF)3JnIqhld+zHk&0VhFaC^S;*fQz+e~-7Z z`&n+=^)Gy8lXdQWjGSYw*nIT^i-zpHXU+rybIp;@|9JNgzBppz=!U(gPanSWjQuw} z<{I+OHQT*>!bK-I8y3`g3<#o&6VdpMve)_@Ab8{CSw%4~+_y6p8Jh{=r<4fzk zv*?HL?CeYDJdHlM`mX!V-1?n`D*Xk-HP<`qpVyufoq5Fk%FFZ6IXkTQPW6gCR(!IU zc>R;Fjy%4;=RFsHvVZrUlMd|+ma&jQ!WXaozK;+2zz{cOsC5<{?k* z$G`XVg9qL>WH(xL!g0iQ*KW9T>&^SV-s<#yk6E$euoJc;-kg7$dGP9!HXnRHdgSwG zciQf*ne*OG2VG_Ty#CFbJo3Qi zPjB^HynH~pe!pL5_nS1vo0UATUC?srd`|2L+z@{T=LuDofy zXz4CjzWvl`FMelUg*RVWeB1_SuXF#3eVeSWmFaNv5+Zp(gu``W{umG>@Ove~ixmd{*$ zh`ZJNH@|mqlilOX_twAb$&0sq?M&$>e_6ie!cS&bys~6IbIiW~JgoBJ`S1S2r_Qbv z&MsZ@)l2_)^9R=+_AtNX6A_vAY6v+@I*$s3|~naBThp}6dZm;Nhm%Z{DXzi8Q(m!G+2&(}MD zcw8#cYi+v@`im7uJsK?7_Xodj{p0vszl}i_y*PiFOMbKiJ!-khti*!82|yW5}h#;w~ecgE&B9NckQ}OyFxsF&+i`l?ERbWMc=*gp5%WXG5&asedZPWjDPv* zRo6AYocH0quk+svk9dv0^Q#wseCBk`66S;ZAFWSU{gS!n{SzNs`PFCNDII;&0SC@I zbD6%S@!n@^?)^Gwetgk0U!6ej_vSAKuWj_elFzo>;PIRGj^*FH`_t>5xaSLcZhe35 zjc4|!{vyAy`{f^l_^oZo%$?3;hw?{0l=_J(7xyX}SUE&p7nLwtGi ziSO@x{a4TEpYM3n-5vRt{Kn_^*zw%Pfd_o> z+OscR%YDArt;u8CKX%u#zX+edy!*hKpM1FC*?->m_lcZYb<8d+zI*&H-Gi1txX!5@ z`9J4Uf9r+UKYGlOt6$yb^gph-@u~g3x~4dNXGODruOV}#xu|S+PLH@_-y0N zJ9j>2)n4_}_S*C3x19d|TjiIYer5W&eBN!3U2@mo4|G>tn8jPq9zJQ_|F^d{T0P!> z&1Lm%559aN>?~h@WA5+g@3I;H;g7EW`$cPxede4C>ib`;pYgpFpIv?F`SWyppEXy0 zzQ@6DUixM9RPXuTvzKg{+;qSJKme4T?zkx2`LtzM&FyrKo`1^vKYRUd1J#+TP8~iQW@aNHz z4?cQpU*Y!6w#ARSYs*)DwaeyDotk;~=?9+u$;PL@Z|$-34o`bGzx>DBgF}C`$%*>8 zd3(vq%dS#ZU6Lg(MEfUxNo==dM4kN(pE)qAc!7Jm7u(OqA^ zvd4w)$JKY&lwUgR@*fSdx6D0Y(ZVgRIA_k|XIDw|wEn_kQx( zAz!}tw|)0}@3^C1cy`T2Xz{$4pvFu09w_fH`o%H#&$%SntbNKK-Z=K_FXgYl+~era zzdZAd`M3W4sJr(WKgAw>WU%LUr#-!3#p1J{$l>!|I8gubKJQGgU;5$;!X?k6doO+E 
zrDMOo^2)bv^T-c>dhA!zcUwS14ldm7ugWefxT5sueU}v2lg_x{idT9&d-~_m`Y&B| z!)<>)v^aiu^-lLMxz|1yWFkCHevhAieZqHF1lW6|ixGEhbNnw{{ck%R^zMS`yVL78 z`1sevd4~my@BhPV&}66lF&FOh?sHFc_P3&gvrhZ!mnSu{xbfvT{o*g`*H7Jh@X?gB z@8Oq!dedFk{_TwY9<&O(ocg;@PJPx`z4R~4b_@2t=DR;pjnkK1we*nt@A=LK3+}$+ zpM}4f=cfYpH`k8}P&-+0#>&_8^->DqUG`uJ0ifBnLtN3S5~EL?kb z=gyO>2H!h&#ibV@cRl#&Db#CE4$i$~gXF<=pZ?&12cDfWFOA=P^5$P+&%3`l;rzdF zZ=bjR@r-o|N~qft9~}v=2Q=iuPnX^J+57vp_Pg$$8&|KQ5Btj{J8XM2cGgKd|MdFb z)=!^3an~o0?(o}nR&1>Ong8p3TmAOAIaaa+ZC-rrrytJ#I1W$w==q*}`dzo4ciZ#c z`@{tgy!G&TZTXd-yuSTedp^G7X5nA0lfM7chi`xB^TvjM8eMnpCdPHE-o10{O<&)Q zdXCvJOMSNNqwS$TZ1Kqscibwk8NXA0aIZh_^{4pti*~DTTlwf@p!iR?;K<#kEB~++ zfAYf{8Mp5BCuSS*%l-Cv^R~C{7~E3LJ@V#vvS$}+!o6HAR?s04K z;r8dfc_RClqxR;0_=_bQt-f>L1Kz&lROy39FM4;?c`qIH#>u;1__+Mip|5{YJS2Dh z;hS!`^9TQUX2s?EZ2$EGzyBOx@W#=l_rG&*<@UY4OI*0|<1~Br9xJYW`468lZ+-p3 z$|FyG(_p!Ua zT8u2&S~%dI$JQM4-dSvwUNGmhd(XTOKWy0-_aA-7weS3T?x`Ps`uRx=@1wu^?Cj^Y-@S9{t;SP3{62W*g1x8KMOW=}_XZcOI{*H= z|M;u%4Hw_OzqBsc?8*Dfu6S(zI|4v}#wEwxoJ{TOAN~C8)z|GDJrT{_VgFnA{0zUj zy6(~!Z@T{Txzl5xKX%Q5hdy)lYxbe1&O7TL7d-G>Z(OL}bmJ|%A}9Li4V|Z#J$3f8 zpWgG*C-3vy^)GMk_S%BS9z5!yy9&CryHDaJMA5I|OJXxCJ^8 zJV{=tRb>TPiFOiRg_C-+dm#?HLdKl& zgwC@UuU+=xE{VQt`x{?ww$|;M0Mr3)EZ*5N@&8#bb*HOY>`LD!xrClmdi~0HPcW`*JX_pbNr6`1+c_g0gU9?qmoC*#F*B zd4CbMc)lWWzraZa&-rmTU!%n?aoWiau2DF`WxDWf3+^h$dz5aH6>LPnib!JY3m;w?a1-_W5jO_cJ+7JT+!~S>kqyhu z+jlinbp|FL8hn@4^|sSL7Ka|TOiI^?5PorvUk|ADw-dFu8kp7n-o^b~x!FGMj^N8; ziG_uSxaWWW)tkr)-I1^sDT*-3ccvVTqW`qy9`5da#dPxl2-W;tj+wDflwgW|S?_LJ zvad4P#P1!>L$Vw1N`tb z|9u2m&?tNA#i5ad%hQ~urbkUqp97HJMbC^K;z2oeZR9pNT!@&cj5P|4GzpeZ$jnoA zj}{wk$M^~J%D?$tT;2SKJye5}$N1mwx7Vr5B-gjhx5x)LCf7S#6%U?IGmklYTj{H* zylmbWEj{>V`%K#oXs1_gtvPa7KC_1Q{P}Q|zE}2)e*CzSYv-ckV-Urf>;~@EGg`kw zKl`jN_$E2Utrz{dcbS$?x9nJi-w)!h`zWP)5{SeY#&f`t9DcS0?Pk4G6)JLEaPI4R z{yd`WcB@7w8p@yTZ){!9oj5vdEu~aA(f!_`VT<>m?N*OCg7LIu$t&PXNlvYhx0G9j z_n*_L?;{?3K(4`NIJnof5e}=+Z~P%YcnhYtbP6{mRP#Es6+CoNn{d9bI9QK-v3Z{( zyja}9z|ypgC$)`~Y8|VjPO|BW%S9rr1)E6@ox*rCs{^l3d+GjR_|WfuAHT%^2JL{l 
zwn}blj7nU0-l_Eqrg295_Q-pvRFy8FHh+%y=yzlkaa-N?mZ=Ni09z$ObH|eFUW$EiS(tYvpU=iCUb_n!7`Xi1muvrLvX4p$oJpO{BFGW+3W07+ z>UtsEAxFD(KqaWe&?l9c8d}S-3P9}WR4bc92@hQRrWAb*y{xf!CGgA~m#$97kM}Fx z=6FAgOni+P58Xe(q@IKIi@aDQ?(e{@r+2bk?rPO&?CT0XG<#6{gbwC(l;07)&{6T* zv;3DoKgB!4H)#D6>#InCEBN`wV?f<7F+#3cZLsiDuJ30fk0s-IhwqkEGm*g&{ghr@uL&(W6>i7ThnyT5EBqI-5_eo`3K2Oy1ThMm7CJ5|rW$KGq? z!DL&{x+^#=zxriMfPU6AxmbbX-|9?TjA1?eA>;Lf@#7@1k#=y8bL$$H3|;FqAqH(8 zM)<xTV#N*i$T1^eagUggy2p@{Zz=vk~N@q5VBeGCn#>f%0$ zcB&{Q7t?McZcP!?utI(XiFRE%2B6# zp*@%HDkM^V!_r&mq#u+%!SX?^w-3cl9(N4DwV!Kr>S~Q~wl;m;be%fCL0lTHJ^(r5K1cpKt5nee+wOc#NKJTaqeG+j zS*sRzd!3vjm7%=wMGKs@&yKm55r}i*sZZC=Bsd^73pi<8T`f6CZ9N<8!wl?j9a(+H zN56LquOdifB1%!bj3;1jma7MflT+uZ_3pu284n(%`Wt=%4P(!m{H~s?YO`~_OgsOh z0vwrUH>U+?yB;&-=hW{in^Av%d&0hVLW$l)^$mWSA#l&OMfvhN=V1@$8O4bazLabM zx^e0GLw%0>#?^J}R_z99X8EbZOR(5wk9;S-$db=&4lNTQqB^pM1^yTDS9Q(0b8}Aj z<@$X}r2vNA^%&vT!TizD67gkQ9A@jEDi&)CY8=#_^U`g)#WTYVi49J(VzLDfb_Tj$ z--Z@cb23j~``d<2TKoO((Hr9$uWY~nEda00u8q*ewWsomWSA0ns4rL1+OC-taC9jd znpNv1KYW1!`f~IqoU-ijTS%mAKU+szbzJXHVyF{$HxnM`DfUmroj_s=GFwO^_$DjH zyXnwn0yzaTcyj&o3^)tL%Gh*KG)Z64PYk|LW+m zCH|Meu>IK@|MTjz@a~<0wTPqGX;5`vg?Qsa6+q4l!fW@B#J0qPK7PY(2C5lD^rJRP zAEZ`Q>#O8waOgDmD%<^yiGZo+7fm|`ajG!i`gY7(n;##%nSU)hn~;&al)Y6ZP(YjY ziV;0dV%5}Rqj2mj_~Q@7q+kQGS+iBWk!m&-Wd?ZFZaD1W?6gvyE%4$LNE2UWyNfrr z5x*Yh46#W2J>B8HJ2gG1A?Z=|yeP{B{u2guar_b80Az<;f(4gPx3Hko{%7X}IWQvk zz#+pe<9ad2Gr@a+Oofu5u%d5Dyo9n)`!r}7o;4h+8@;!+pHT-2Tc;DQA%d4ZI`22p zjG~(v(Wa-~ai*w=| z=vb&w0qr?I>vIYohwNSPS_i6`9`@NP<+tss7e6;nycpmcZn`xy#Q+7T9GV=1NAZ3C zvyQbyo^0ux=|~Kk>25_UQScxWh8$|Z=b+UrOEKp@3%vcyv$D1p; z-QPemxOh{p0n(kiWWTC}Xl^m{zzw{28juZlr0%*^yIlHY`@C$}4=0xf5wn`3mlp zBPj!k7pwU~+FDh{x@~vYrSVmMYcIPxI5~y@(Iz3b-83}+i~&A{Uz{J~bKyRgAcNuc zIDc4Ldx|%2$#E(N;L(aAIpfLP%h2S^Pg#$~uLCuW^HRSktQ4G06q(qq#R?m4Rcvs# zKd6fzHW;PYaoyJdd2%u)8-)th4OfaplBt3TNm$=EY*8t;(kDXG!o$9Fx?k^CMn3?099+*GW|g*fT#lWY+ja=$o=1j3}YA}z!J z_8#BNiheLh1;>TDUmm#1Tn0;M{@V+%$v(Ql`IXJY^+FPUh1hS~i<8&#=-Yo`*SFv0 
zA#se`e|dd398)Qh`-;A|aX%hKN#ZJb^Pt__5-aN2WHPoMp`LX~XtF@q)h#ij@(N-@ z7{@KQha>*NNViA@vT19<;gXvb9S-mfPDctArLRcu>G0M5^-$GLKeY zxq80%VbN0FHgV2yNm0Zz=}d}@3E&k4Ks~9%(Eeh|$5+Zl(`wYrm<9J2bF$wv-mx~V zAl0mW4Vn>6@)H0tnIf4mxX=oH*cB@Tx$T)r#^mTYR&MX*-?mDX^oeNGKn3V)@%aNP zlwVMKO0H3Hh9+pxSMcWBz_-SwOb%+I5TuyuSDw14g41!4Fp zDNhW27%{#aI@yZpSUZKS$_~;<>mBVHa4{9Iz-tGR%pUo!J1g5)=_M`^_F|2$K4cOa zF!opH5z~@m8y%ZI#Dx;Y28T2(h{spsDN7UeKD`JoW*}ebs_plpJjsA1ZbsNBWC1jF z5_g!~uIfQQq}$VxA^UZYoxO$uwSG>4K?r5~TZwpCLCVAKX6FwNN{}BI#x>uf zURp8y4sBTdvNPvf^IcdYOz-dcyF7`?YjS*ST~a&Gr5|vQHHs6Ol5B4rb#e!!(6wVZ z&5cOGVD4j~6u_o8i&(~XpFiAvRrkv&{C5_fcT{^O-l$C|@6g&KMK)dn={M&}pd9eN z%qOaz>pM&FWlUw8`{sq3@T^F2{K^xYM&=XTeYqV??B80Do>+{ip^(NLuQkk>192Ve5FO0*jFUn)M9x?g3V5<7t4 z?t6T}hCM(2A;9@~WL^~H^WmYCULGw0){4Hc-zNI?@)`Iv%nG;Wh!Wt&Rbo;_IDZ)l zUyM6;^)=>Ewh3;=Epb_5NXE-;=DM7j?Wv+VIdwkGdMz)a8SpTd6lWn%MoOFzak=Eb zFIF#L6fiH*w6+w*^_>7E@GIB}W3b0j{qK44)HdBnP1MFyj8X+J!@#Ok0N_Bxx@KMQ zFoIm{l-{=E6v=oQs~7{A#$ffqbMkPh9!H8$^;b=2Ps-?{5SquVC=ZZ5Adj?rN{EO_f$5@1Af9LkANqvu#a?7^; z&G1CZ>)ds}ZRqj#-7y{BZCaI|-RFv##RCMzCsO&G*v8au6n=0VFM}yFpEVO3LnJAq9czxSd#yN z%S50rL_wWZ&{9FDu!s=hke6{I>}ak;-E6TWkd-9Eer}DsHCd&ysn#X>ycQZD1VE8< z<-v(eFEz+>Rr-wvNdz(CaLK>-+FO&(Gi_i1O{ZRdhCafk!gBt7(SG)IcCZw-(wGuc zD4P`U-F0BD0Br$QS-SoquktsUUAzdP#d1O>sqA2sqVyUjItDRGErz9Da;9t%RbQVQ zXf(^U5ols5`8YS$#oRiTWr{md3#scYy-R(#xnd#{gbWstW>*^YA;0|SmEMhleUC(} z@OWB|2XIi&3I7{RRo7HBqlF+fs=XFw_0I2dHpb&uXJP`{`4rI({V}oCHV35>!ifiSywJak8nc7BrhXHa@Y99vwGJGnsnB>DiXznWZPWdhfmm7LnOVErXtpF>rx(Y zyBaM%_UEcHW1&Q2o~V8&K^g6ia+#;mR;uh)epdN)UdOEDLFY@6*8?v_?igB#MGVGm}HbQ zSh!#5mNDw;aXv*@WVC`%FkHw_$i$IV(D$YX6mgzZ3$%jPBJC$0Z3 z6Z9K)ybabQuV)T2ufWDg_>}8^P(iVwKoG5=LNqIc$aZNa_7K!^8QXTT4DdLt;kI~1 zE^*~e7NtzVi?Ze`6JuO(1LBe&Hp9^7s+zA7N&!-``nTP?9QI)=@?}(2SRMYRy)w8% z3hQdTA7yLM-!IrFS&x3$G0WE)`o^p`zm!i@8g~-R@5iuZo=ZbLt)$U?=M>N}yE|D} z+)2J83+jr)$k)Xlq7LP& z3@4XXxa#lgiR2Z#`a07~(<2t81UEHPlJP#4%4s-GPmGXls#jp^CLGbQTfkddw^B;8 z`PhdOfts>a%}HGa5YEEOqH`M>Js90?_{LX4(}C0E|Pm< 
zmRZIetB-rV-~BPAey{sIE%a=9uSx8LpjfH(&rOgM*c}@!%)1er3?(T3MI(5YmGGBp z6UO0=PVMpW%Ng~!90A$GR*~{wD@m-G?Ci6042Qd$AcQT96A8MDjjTiP8ncpny`}3- zkho_T7jU+}2PoF>Hc#2gn=d|ibYJezcHH|6jR?o2_r5T}WIKi7W_B6|)4lRtwYp24 z=$8rak9K)nS2U`&sU>F-wBr`=pt7QqTo+|p-jRBI*sEPw_=9LL8q42PH1T&VR!~c; zP|WpTT0F<;h}C%u&~=2?n`k;*;177kO#xL02j3QjoMbmVrVZb5nN%RHAN3=DqO94? zacl8^I2W!jc27H&T<8zEkl<7Nn@S>KgZ3BiSKlAt#w$W3o`)H#QGKWpTC$N<)Kvw1jOM|4LiIQ(#=yN&wg?|Wv}4fk<@2Afi3;dJr&m&|Rn*wz9049@`-K(z(}Vqn&+zKHg94?Z=_>H$g}%j~9?IABiq_YE zQwH#WuG-+9Hvn(mF0C_%eZr=F@1KW>_?z#NQd*56R7tA}{^K7O(*h9{U+xjS=D zoPK=(X(Vh!q+_w4iG*R~SN9#BNw4x)1qi@=|!0z_YXkCb? z)aXn4(jeAo4y+nENr|E9YI#D4mpdODwb&p6djWZh46o2I!==09%|a0W&rpGCTuU_T zxhx#ehLwO}d_P^V(CD%qDKm&OM#QI48p$c9*ShH=2T9~o z57i8+38BfNAKljO(S9zP{UFnDQaTz%eh!Xa4b5C*WjwN`WneO^y=Z4_?0Be|$my#{ z`kE8CBE!HelHLcLYRwsaZ#?*>0W{MfD;{NFc&Vb_0KZ~&?p602{vP!nGK}uWXn<3F zdzR5BHuRiqKMhE?D5kxM+PhG6-lR%hL5xO z6>1xie1Wf{fJ)Yf!E9CgbCtQrSQv9(91s|YXSE5L&UI1xLd9FCwj3jG-Hg=#*lIvd zH>_}hC%-f)o%!lfe-ZVbu}cFWc!`*S9O}U5kDl1q!M3Ktp|H3-SD7Ce*oF4T28Uvt zGr@|A;-4|Rm2Krpo`x49}274xg<@J3Z*s#ux!=}urv1lC)v|EI#5?bNC=uNhZIwLf_~3C+P*J^X zEZ<&%Ttt&ApoVD2ok#hp#j90Np%e(`ltf>D#R!rKLabzA75o&AUt zYRYF#_iKH!7aBQ4mGXH+cyUtiWxCE;SO^|t7I;vhHZ4cfM}^+2GvLwu6HV@EzEg8W zX;<%ZUFgNnzNo9Vi7kNKCpBUuZglQss`xW$mwlf5o70Uti}>}5Ma^@|&3P{k$ZTZV z$Mwf`EQq>~+1G`MVONR`)HIDSwNLi4Nv%=zT7LcLpzQNMC?3{+7K7|4a|#P{L$<}- z`|o3Zx)lHkWJ;U+E2LC1Sp73ZFx|_Q(B7Mq*dhVn1=$0{llG@{k>gHo4J(`671)l8 zwK?N%1zz!!hP|CnX^Ryco5hzWfc}iJYrZ&I?}Ab1cM1vNBmot%LJQLE_ick{lhj@| zg031O`*Q3=`%Q#;8ZW`S2Onu}Vf^!b>;pEC)92P2BC`$T?k3-)^90Y^*E&&TPBs8G z?7hwRK`z-ajayPbPG)o22NMlQL8W)kMuZA8AHc8tT}QUmguJI+&aS=pA<;%US@{2n z5RxHGx1H0O+;^lBTv9>Z`vHapZ$nA2v3Kr|teGW15pK@aELO;=<+_7}cdrJDV)uwC zpjh^yRi~B@!xLP>9%7*+t1;M_@s|e+;f}`1*vDo5k=vM zF4RRXNPe@@vi^Rz)^&YAjmc)876BN$5wDj)tP@fUZO9n^Rx=`vKHa)SLw@8A77 z4SV~x?hdafrlv?*n12nT3F4WIzehzUlwHLt=S~ox& zPg4vkU6l#9&qbobShdAA5h8(_ z{WB^wKp#8*jx2>Ja{|D^_KGIL+~z5JXq^iDAiUY?G zk2R^-4QYaQGqP&CQy)^=nh$eJ8nq5xBjzBu#U9r(A<^`x#NUBV^U2EgTeIx47fZFH 
z68pe+%8P{McXWT>hMVJg@ZqI?+D}(RL?~x0GCz6kanMclRcX8V)kIlyU^_?Hevqp_|UKl4lb_#m=FSe0^ph3CIT z7oGAc=EEKJrJ3U*(E4H~$X2<>3(lD7N(zWoc^kx}3ds$)7PKC`InMSP<=EEp9?1}y zTr?k>Ds>)Bhck@Xe73@!LDz!WB5|Uye894;NolL^w0(ye9;3Ry@sy33(bpwP+34Fg zr?v7{JIjKfmSLXgNI|)LslFi3Gj5-NFAUu#t)d3Yj}0ll3OBG5-dc*-(XubzMKqPN z_PJaAJVD59A2V+nZg42rFoh~mCV;}W6Fc=AlbLt6T5t2D`6v*;B2a^Ow}yb$w>I_7 zrvl>1JWp)_N=FIONSf}osWmSn29&N|J7x&7l3K>&-%Iuhz3X_pi6l)^l(!2F~Q{`4|UcY6d?mNA@7ldIm=!qh&DV|9Nwf_00jVO;Z{r%&a_W6 zKm_F~mhp)yBwijl#+UjXj5%RCUFn|B0AISy+$xM~>0u**W%p~k1-c*oZXcl0x`(|e z0cs@-a6`EeA;Co0+g9*Jz{SSQC>X0)4ireK^R73r-~Z-=YueipcNqPXKyKagRWxf$ z^Ln+n?ZXP^mIs*%kf8Mo?#dtWE|Xsv^X}(Rv>PUxc#a-N7}EC1cYPzFH%``O`xzCMcbo z1uSiZj-H<~`waV#)aGZMwcl-uU&ca+Kb&o|$o_NSL7s{s74(B&^JYGOb#WY=(a`z_ z0Td5LFp+6&s-mjqk$)3?MdsH^-t1A2#>B7>_;{ zw}AiNvDeCc1u>H|RTcwa>lOrjb6fI8eom$0%jSz6PYUR_u1D%w)Q%7sc=QGJo3Ss! zlyj9u7YpyLJ+36T@^iRi3ARf!PDvWFgRd9}r^i&_Ezsc9X(wMqeW-OxWl zXM8Rkym3l90p#hf7Kx@Th6Vk`AtsNBjrhCky<|Xu5FACt54#Ow$DV#4Wj`1HTQoBHm z6^dT8eT!~%c*3YqexsLW0Q9fo$cCOUnY=`)kNqsGyrbrBh&}9~1@}9IsnjdWM z-<9%gtQ8E06Y3pJ%sE#sW`_$@!|I;4zp1TwqoGeJvxkjBvUQ!Jw%a82w+hbn$Yh)4 zyMw4}bg$@p{fr%-?Ku+A*kn)=V9)oL{xpQrKu;)F=YBBL#KWoyc@J!od_kS*rb4w7 zatl^s__ph5ii)#ad%lGul7l(hoBU&kyF$)bghOfDqMiV;>8}_fxD^&gkJP7!Mt?_g zEx}hU`o{LP@n+4K>~07x$adHjFU`NAk8d`F@+9`|7lBT0L~Bq@gXVoC!07$(giVi;L|OD?drbKn4A_4%p}HGK@3Dl6ybrS1`a};5acy6-#G1!Xz?4%jM)I4%V-%A(pX8m z07-?dtTsVAC5SW9Y}Ak?N!L`#ggTnw=1+`s)&<{09%t=K0!dZv1h!YoySxu!Hlix0 z)FV`O_U>YohoF*XUK$MP;o}a$QiLcl*n=;a;Ul?h5h=FMpM;YWVUH|0u2BP1w*> z%i6_!;geSiOPpgO)+lrqi#p5xTNXNEe2mYsY^As~OnlFc5PUea>T~LKffi6EC@m@t zyHa&nD>5h7Az!^{8gP^Z*Uj!e`dug8TUCz%%_+>cJ8su4ga|j$ve~`YKv#<`IHOfN zb(y=UsVdn-uKFi`m#BDyLcr92dz1d&u(p~xGjBr$#xPiyGydW3C^}bW(P>cnKcxEo zf8H6M2DFc(k73$k(ow#`!m#+uU*QJ1yZ{PV{~<%6z!$xJJA#cLB)2&d@LRCYj_~#t z!Z z&c$+Ei^o%>*foDhIctWSoP6cp_`)_1x#1nkHe0whksRsa@R*f!#$4w{ImTp*sjY!i zGC2|o1hm1Y6D1*4;*_YA6IzMt8po6^)2fJ&SBC)%wk>e`U@<^7TMak=`E?6jsg;SE zYv){`jUS8{NG$0g@HN5MUo?VpD+w!!X(5uBCUuk$dWpu#XN7PNCJC-$6$(cV_R517 
zJ2hW>A70qT!Bv?2Ks(AxPZ+Z)?j%07O{JW@I~r+Lxk3sJ6=N7OR7zr6WZ!9j{Q`f9 zt(-P$wDfi6w^qjKsbtfUFE`o!{w}|~CCPJ3Rjy5$r=vR-XiTQyag-r4|K?rXI`X<}y^cr$Xjg~`p-xZ&dN6JyG z5u+bXSxi-89Tpxo)oNN1!Ozz!O7{>e<1hMBdr{Zd-{9`hd&s1xFKclU+?r{GV#9{k|u@>J(O&{%i1XHNSMh95~ zA^}=+os+*Ex4DRwfm~BVBLhGC*D^Rnlc@LIGgf(hPM6CJCS`Gxy%VC_eyOocr(mN@ zC*d2fwQJ6+K)b5y<3+RD8ATb~^>}I!A-xTT7@#i~n%H=lf8R@zHh*uCcP$fh77c%v zc#t|-I6U%Wo#M~aViweveI<-;q~ZLi1>Qs_{7t^OY7nxQv`>^31vCK46Nir`qCkZB z-SMj+dqX^dafTQwgo?*S(wpod63DZqT#|dKN9n-f&}PGso*9q~E=x++@%QV_cw7k+ zO7Gfw5~OXX`>hR>PXHAX+2o4oPeoCCxkt`PT0w8K$5xL@UgMu0-zFD_u`H`@I%3+I zALcKMQ0Vwfq$a#a=B0_a=|bDALz?eNtia|#pH%X>5?((}KpI0}+FCU04t>-sJ0H2T zp@Kns_vfSy2F)@B4Rf&E(nZ@-8K<9!@YfBiS+IzhQzKb#LKau_c&I6mw*T7;@O$<{ zM@AC^tkS0R5UW>^M!gJU6V;7E4~K0%fNJ-ZP@d~LPp_E8qB~E!A>y(FL-gH|^ISJo zNZKchIN%4QbZofqz1VfzRYJTaK=Q^lT)Jzz@UKt2WPg9_HCz-E3B16W#@YiU!k4Zc zOa05b%Vg-GneY3Wi}j&*ZS_W|=o_|p`X*R7Kmd{OBT=HDC2o@mjJ+LHyu}28A*PaH zp+dN|+L)H4+mq5Z1tTYnu#x!KkHKsN`A<6nJ?U}odc*2g>rLG)+#49}R9}MR=`@8Y zpi5f@Q%@YN;Ni)*p|PQz;xLE7hs1|u)PVwNl-WpDn~3o^v$~E2N!IaB843{Y1GOQB z4Zfpv8seZI{K$Lgl|+C_rA2aIgcpz2AfNGvFtTdyz0G)ve$Y`U?5&tw9(^bwVx!R$ zJlGPF?CKe;nh!eHFM*((siVdvnB%)f1XdwmUk_vz<}I2&T#ewVg3>RQR*^^>z2u0Y zb6tiVi)e4rc23rGq4z_KRmD6kcOChRxVC(!CGhJpg`pes16Nk_oHhcO%FS=Vt6`mX zu9uB#bf{Kk4C`nSzjB%O3Bz7lNYO1WS&S{5pCGB=v`;)5gs9CEBeu^m2+_6hhGAre z{Hz2tb1MT+-7F4Q>i-E8k;z_zWf(gI$@6Ggm-*#DV2gaSbMnUbUy6wHL~@o*7L2D4 zeL*)xNCv&sone(M_tu=e%v4t2@TBKjInc0q)(FV=KdiZegOCZsA>iQIfHZC@M$tf? 
zi>p=luX2U54?KGXZ!8xj38~U_FB`N>1crlyMVV3-T9b21pH=fI1 zBj!%rFGQ2Uz()84J<(or#L{cynajL6A=EkigrZ3INP1y>_uH=*sQL^-n8O^H453e~ z{A(jP0Ev5BE+*fgRgiWlE~;4{ibYbG%Qq!Atcc`mXN3%>7J?`zvP_C~)L0cpke>n( zYFsR?76An$loBQ6W)`Wn?iqcDQm&RH3MzP4SV-taXzfh@u}7$&ADFd3P&#QnM%c>@ zs#yBx9+&XfB(guIrcgc5-f8-DAIv~)TU{gxaU)i`GV%I2v{ri&DxkeQo|o4c&nQi$ zNhjn`=0Gy6ghw;1B19plDGPqTc4pkzlxc$^a~X+_-!jH5oVOEULay6 zP`U~Qw6PLhN4h%nOUsjVb@;T*J)Uqc?a6E^^7B6EkEK8=Bzy~j>YL;|6JOdr%CTD% z?LXKPVG=z0*c$UNpFdNkp$wM=?8R_BE+oCb(}06x7R@IFNd7Oms|Ca32}B>Z}MzN|HPV zH!*g8jW7mQ-PmKVQ;W}b?)swlQ()WYMy|>pAnBIKkP;GwFt1OH>EMy&^k-9E;Z+a( zZXRwqKgB7R%nV`SPDu0mc)v^+=K!B5{0;#R7XI+v8h^qoV;`G>&i}UThBpA5Ouu(| zils%{;aG)fTmARgh-c$XgxxsvOSOiR)OZviVb+RY+1Dfx8O2@Xwh@^s8a$7IIO11+ zQh+evx2?on;Be}VrFK^Vs^^@ug zz+Hm<5FNGA-1pjp5hP!OUwlFV#S6<30p0$R;&IiY13CmnLVMUJLX)pILNc#tQefL| znsj3mfOM?41pb5df~YoI@@i@b;{n&!B1bkSVL3rxWm6o3Hb>UKpg)qs*kEY+inK<6ldVFR}bp_7@U6D|4ID{@bRy0g05uLEvmAL-~F-a7DL8VgC~oF}AU zBB_@UIHB@@vG}><2JE-*e%$=5TOM!SoNWeOv2FtR=djP7tX{MH#qRn+LER> zYb(b=3yo(GbhCNe=0)GPi4J=EaeT61XALQo6b~FNv`9-XLgH!@NLS?=$eZyRtI)QV za_PM5k}Cz@)?%zRoJ;%O#)nRFSddoacRA`X=YlL?U|_NJM(LE6nDt)nHf(mOvK%NY z8kcEcQP6JEO6HaUKWwp%EK;>pFmm3a1vEvbU`&Jxc;CBXZX)&GNME?n^%$XX5}QX) zlO#r_;GT5dsD#5-Zz-MqWS0t_vQe=4QcMiX2o0*p)bZoD_VQ-HyvsMtkU=M*B#Avi z!JTufNRfhpiCiFg;?b)DS2)jwZcoNTyVAeysHA zHbcK&zr!vndKF<_xp_$7w`wH4$Yac6{7E`GRA`tefxBdp31%U;|IhA6d`Av!!rw?i z@}{3%qZ#4;vT{7qVxndY0t60M(lY8_BSvD;;)R|O~4TrIZrgdD#~6AzsdkO?`} zrrWT9%7IGnaod|E)TaX-GtnPTXbPVTodOW%^mWUJ-;?p-1bjhb#PB)d=gu$KD}M^} z4dx(`3urrFQiZo{N+cq=BE8M}ktP~_NQ!~iON1+q!Q7iEiX3J3ay*5ST=ErA_W!LEJNdaeMISo8lbVM3g{)v zU0cp2ILC29D4ESL2@2ohQB{?3Sb9sln-3KN@O_tRoAgdO~4D2cI}5`J;D-D+R?v{v%t(A!+GF zqd$+08RQw7jGL}B`sf>kR2}F_zV-Sg`>$cbQ4U@8h%_F$Bp4Iz%Wj3Zi?~bO*Fe@v zLE29foYU!(hSi!fSkGWg1Eu}^kpNg_8^>9P*#_Sz!`cZex=N#{k0)O=&WKOIw6oPSGry>GGkcSnQcnfnZoc;c(IXI0a62*l@5pIRN9I{Qb! 
zwdngo&JQ1HW~cl8rR_wsPW$*r4|hV2MDy6+@8q1?-1x9Q%{Cf$K@{;&~IVSX|z4< zIC)4;u=3z=d|2uBl{03*PcLAl5+T>rO{x_{EV6t1dE>HCeApD0Bkk-53Gv|JvoEkeUmxLX4H8Jy;I?xIl(Gw zURV_>Xkknl2D>mkfZW_{@$;^R_s4bc+Q_vRwW1h z0q6U>wT78?W683IdiRW6=w}$+(?{^#MGxqsY9*4c-s)SwEa<~^3&J_kLxC38z|Tjr zkcnN)hkc_fU+=P&9*%n`SKFhj5NgNjhFzyd4iTOtNua0@ZMvF}yIb(xl47uJGNedc zByV>*KOpt!($msa9$ScprryWhb9g^WEB$DKFykvfN5$E7CtL#v% zU2ZeO6hQ@)7wB5vm~`7b9rR_0M!^@x?Br6RFZ9jS2be!1C@ZT^KGbc!p%?M=K+WlK zAonv@rC(;<8_QE-N+S#oT5kP>APj^NLP$#UIDUj?4+LKzihjl1aaAWeuxkTEA)N*R zV4!v%WLndS6Q(CMWi#kmE*BG0hx5oHFsKc97&(lV%4srQpC)ad-%0S=)E7{lF=&yoGx zvkf5)3{$Mkmj0Vf++hHiOh24`?0*m2(gDM`M}OY@%L*>&?5{(dWcUTzzlR0>J~z7O zg`F|IY5kipVUF}x@`cd>cupG>x~pUEbwbzv7RAOa$k zN6yC*0&ZNar;B+R_a6GWkAv{LZ?4#8n2rlzBvI@)7G?|8=JhXv&@R&Lt<&V~(3shVG`N!)sDPbX1lZ|4aU-=G>GU1~v!C81)p3XNyxbZ6>B|J~VsvEKSzyI+ zRh3jhd?fj{&tR*gv=8+Zo~g{@wuQl|nX)-;_2okRKh;VjrV@?E-piim`4hhc;eeZh z`v5vV#3EEcn9ln1G)GxIM=f6^&;bU<>`FR#Iidp=Gep!SX7td(395J4Of%WRb7}~P z5<%bVA+$ug9Rp=M#+~1RStqULD2YZUP(h_3XgNHq#X6WJG)PtFh#Z7#1_StQ^mUHO zjIi^RFzwBA7Dp3IGfzZzk!3oJhHHmHi3Ot1xI?PMjhZpFl$fD%qYe;iG@npG5@5t( zqIycLE)-j7+1e{hTdC8?G!O32M)Xq`fJuLh6VE<8pF*;B!&E)8AQ}-_R{nqaz-v{A zMzR1GRu^*600r)jPvWLhlQ3PbHAGC4h+<$a(OXH(2%(lJgx9}!j?C6wE1`>sT)A|> zd!wUN>m#AmK&9zi-7P=}_6~tJCB_X}Sr7iGS33&jf$U(>{I$nPBt$F(W`*h3d|UcY z{On_ca5S1u1#+LV`6qEJm|^amG5ZMQX`nJzeuiT(IuhB?^P1OV35544d8;8lC8{U9 z^F307tR5tOWtm6gu><&=-O|b1H(yuGi-{erX8pN< z%`+BSD?p2gOB}~KpxcpE^1%pwC$5CA92|M#A&@BhBh-7acT;^K$VWWFQIn}xVymi=!(xk|iV4DNKY?O^4| zJ+&?D-Op`*)ZQyBYP2&jnv{=M&t$~h?S4+KhgFpP-PPdej^F!H;$^~gO1w_R_|=5R z-T+yOP40aO$?y+Lvjr@`n4&;$k*w`{7n?d@o2IFD{(VX9rGL}bD6&|&F~j@)r(vvF zi3cw^OGDp9!SPWzw@H+4RlND>;9p;AE0?f79oaETG$prr4S0zWgW%~IN>|kg8f75? 
zxJ|eemLc-N@aux1Z4V;SmlMKbg5^G07o~=3!uS+BkiEODt*>L89%LbAQzcUG`i%|j z2Kw4<3u2E@*@hE=1&)9f@;PyjNp1y$(BHEZ0JN zH8|HGk2fgwcvmp3w)@rfbXu&G?cgy?tD46t?8(wxj@5e;&y}{%-oCvg5+bKwI@&rtgese#iQ9^E2wjnhxC#MMY@N$ok7NTxz3 z`mNUNGd!g&r3?n!WOyUt#2*{Y0SSuii>J73a)N|OUSh;Ah+kb?H6~)k)M`|nz=+o0 zyAy1GQa2dtYGxybss<4s(*7R4IK~S7evgLxZVI>FuGFz^DiTz07sFXMImtpwGRd4? zu`<92$b}cs&)d~67iLXZro|SoJz{AkR6mtceKosOEIi}H|JlAS2}J8@tKNCvVTtRJ z69u1Lbez=nZ+KKUaU%^c zhBzUS=+j}s89g8F+&|fAFDp_v8E*s| zuRcS`dwB4PEun1{((AfC1NAa&c)Z-_*jEoOpEsmzBRiuVyfq(P8BiYNgQ8}lu4%zJ_n^udEHzW(eBin z1iaZO!Psekd~ko*xx{N7X$*Jt+UV98=nWY z^z8B;dCo^?jC{(0q+Tx(8^u^AUJA0c_wUww#Ls8{UU%u=LFZg6*E{d5R-Oy_@>n{w zTR+xJ-uk4~tFfTNk>#O?q#?lj@G7~3<)d*+q8Q$|m7~^HQC$MlyVHhW+hdiACQ||> zDf7T^DR=f`5H z_D@2dX1`BATs+(zwKe8;-FSZ3)VOjRY;lY6nB8Dwf<6bJgxI)HPTX~A9~0?C%E3E4 z$B|{BTgoZkFOxrm)%(}#h0Z4Pm?b&|vL19)2(e$An?5f`nuos;$3MswGNZxLa$V~C zbf7&Y+!Ao+#teNy=AV8j2ppoZ7^u3x;z=Op^L^Mbe>h*inm3&hLRH%SD1Q7#4$L8B z;CFX{?{jv6KW`h;qJLW;&aeX^XdtQ+Cob52Gy~!``nbv^!etOwi8j9eC5A`}z(#u!z6hs7Adi<;)y@NE8K8Ysz`|z}C+egiq#Q#3rBld-w#S54Hn^ue z--^|F3>8qur3AMU6H?1jzK_?|d?WUPBisq+V)Lb;B_|DMN0(+u0_w}R(6Pm;8>_}Z z+T@_g9;Td64!O7!(|Wn!ZzuWyc!|pB<16L`(T!*Sc3y(uO#u-Xx+NIg8=Wt>vHdXh zI8J6Y9Y}08lT?@7Twbj#G`?ilI#j#Zo~qxlS1WvbJ20|zKoXXuEUcR|->We_r>M7^ z)Fh4H!tK)O<9Qt5HK0J0>x`hB`&|22O5&^&H^kbZ>LLk9mMEj#{RT>7#4gae(~nOD zO(Bh`H{uW{!WnfK_o8rT-kmfU(yKKBwy@MmVN!89DjEepEg3SWKx<5%F$FS_eq_r@ z&Ps=Y{YF{0F7PIW@g9?tNpi5mfc|=aPx2a$T3SI0al7+Y5%&0ClvJ1z`*YEjSmS)v zW{vlG4BCutSEg=9Q_2E|%_Y8%H!|oAWff#u!gH2tO-2jKHM^6zK?(4;X^_U0SIq88 zQy9Yfbt^)^f>c+nCijy(%gV|Vu3I!HT{o1N|I>_CWAQt}(|ehC{{1KP2y8WleyT&F zY^2_$5-w}460!9&=bI^>UB9QhtYK^SDyk`y!v+bzhl>J!=LR*!VY|Iu_q_`#qS2;M z>n9Yhz3tRiI0L;Z)$x~tDQ~#BAz?A!qteRiewxcSc(R*{Tb)v$c*K@_1IILZr!9pn zgE*P4<2?AvSLZ4mHA#InqH^tgF1okYuKw-^*i-4=H1d-(k9@2YYX|itHYeb7RdCYCWcy_yl$cD znO0-{KqCLg3MNg3_H4CPjYo0Ioi6yLhKUy>jX6*LM%c! 
zbuj-n2cjIXxQOe5LcdKS*)H0&B=XDlZ^6~FBn0?$AX|bXfFnd;(d2B(jp<0C%f+Tb z%C`Iay!tdUak=;7oi`&zN=wyCn!yrC%TQB!qnS?}S=w18{CiQ|EPF|mS7?ss^9*Pj z7pq+)JD&P?cEFw)i zDX5riFxTsY^d9lmMdC-BjiC7F{!Hsyx)1j!o5psUq;dxk-s1)Dm z*zLuCP;Zs`G;Kcb8=Aq6;rsPxPY;t*m-)Kco}!w%tR^*6mmVElHN#4q4E{QfT09Hk zSC|gk@kADBFbB^oYTv%+&K6iyt$0T;aoO05M73No%bdH;d+;1o-H}Dx_+D(7H}0>U zkzgN`u9Y{gb}9{jd;<0*d<5DhCv!h052wneICeuI)xd(*qTvCHMRpYT@;lme z{esW2;_MA;342>~EG?|sWvP>6X$*c2LFYTk%Tt+N2YbE}Ap72{1e7a2TRUeDo8`3m zCx0tKq-Wk08S>)|ISX;!pOEREveg-4H?k~THft;`#ke%7?#$t_cENO z>G_*{hj8>#BwGmgZIwh4<5pAHT$=NhNb+R1TJy7fZ^aIV%SzC>7gf~BvW^CK6HCZS zm6ET~-@9 zW78BVBzsc=fu}F3G)hmbqK^2@MKqhqF?#sL)i{lIewz&OwVAVQjgKWRfYon`0c&jq z3`W4{kh&ge7`Sv{J=`vk-)?pl)I7K@{FJnsA8fGl7E|_V`A~cJupuSSWwq}tbo=Mb z^DdFT*t7lV4vPlK#Z#G9?8Q1wkJ0%&JE@DKyuo~HQ8eE!g@L1vUlif8UKH_k_(J`g zUB!w|H)pQnK?#CTYpsgu26ScD$GilzdFM%hHgESVgjkQXT&WNA)YQyv>ea6d+l>C) zAd6DE3kpeC%$JZ>bQrtE9kpTBH+)NJP_v7fp_Nx$yW&7A18xI^unhBxUhU5ov}}Utv#5WL0E}wE=jglt+Phli=rO57|iR6ihUG;rC<4V z0_6HcLEnTlX*=ply`CA1gEXJeFn=KR8xwkaP^QSe#0bZc`$swUcBfidtMS%t+0+fc zFaaE(;&*kxGw)R4^J7e!nD$goZUn$R4mC84bGU~IJ7HSbWc?L!39PQ&&EZ+f&gBhE zolnk-eYCIxN9$0|JZ)VXX7O)s%X&q{GTaWSV+%)~qGv|F>MX;pEcD{@1y5k1$H@p! 
z)8+b)#ly4JDHPD&8OxICtr@qU>5d~ln0q0$Z%sYh7dNkpnU2LE<&)}?5&AQD z*+3_Y)m8tKoP0i~?n)!R+PaBZ^!pxHgH1-O^d(D}rxT1k_bz8gJi10+E5N|Q#w3Y( zi0iPh=+XdBuq9EELd~wGFBy1W{CWs*>y~lH{T^?$oA&)0waRWMZlcN!tEIDi9-|;l zMJ0`HmXr!N6)iXAa4*VsW>&n(<6{jSE+~Ai9+ZiAs_Z&f%}zVsd^Xk>Zyf1BH^^eK ztmC%sos`+MXwSZ@xeh*YNK2J$N!f=mx0wdg!k+`1;&ZNMD1qi|{RZNdv-MDC{kH{Z z)nzMSGDW@vOr|50oo4)3C1y(z7nA2Xx^3bJNIO4^s?Wz6j*E;n|yvO z8=VyOyoh=EwZ0Zzb}|iDze=SDC6m#g=%LRUA?Ep9FR)KRM2E%8bl%zNSeP36jb8@; zW1e0G{aE_Ovs;kmHzk3&s0&X^q{^9R*K(fCF(LDc-1O;iEp#bsEZorJ!Ql3^=y9< z(B?iL<^OB2{_ng};C*7h(p&Yk1NY|q=%((C#q;38d7S z0*_C)ON#QX_PFs)fG!>>l^Igb;x`l$sGIf|IATaj{-M|VYTmR?2(ahU*k6cUiX{IP zNt=#~RWTo~xn}EwV?1cA0JUzKt7E!D>`}*(Kz)p3YYj|o1&U=ZMMCfvFe2Z~IIiy_ z>;O@d;ad96Y9_w0ZqhFC3(^{DCRGfdYT1GtHd1OLL(<-TwN;~cuHtYLYl%Uc@CMAq z{&>#@n~&=l(!6I{&H5(tPPtd1UPW59t{=B>CeW0S*iEa&ssO$6Hp}&-J#28(gC(ixHP*RWPoB4qn`wl1m?D>vehXc?v$8@{$UaC$l`pvi1 zN_B_nC>J9=KE4OhUu0_G2OLIm5bx{IL52$cmX^WF^Tx_}f6X`|k zsTz~d9@jyOo$1A{063IfQ^KyL^%wQrN}Z#oBB5Uv5VBh!1gsjNWDY;>dVchoaGCui zyaFY)u59`Oe>AwT9AFVt#6HgH$4W1y5uMga+$F(joC^)>m%JN%`lEbgFC@k?> z$?qA`E9r+23&2ia#xYD*8JO2tx;EA>T3H)R7KIvm8kaZK>`Yy?Izi7Ku9m02_EqaM zG`Kx(uPaS|TPmL@^QmwQsUJj%`Ta}jU}spJ3OYxa?9vK6@^}NJ>=KQ>l!9_uNV2ruO53*1 zy^4Z4H<#o|pv12X2cxLwi}-Qb#^}<-{9D%0qs7fYrbU zK^lt{2WPoIGF`OgG`(s*j?5o1{4;p{bw5V%QnE&%qTKLlwfA+|ALBd7db#Ri?b>B@ zY-{Bjt((jCnNyb9%;nHh^|O~!j@*6Ve>w_fE6<#JnpsZqUr{-9xYtXdb6K#HESC(* zeG?vsgP$A`&MjHU7+juEgHxqZ+wWQZs$9jn`eT?yypVhnYS=4zN_l`QnJupKE)1rp zI1T%?9-W;BaU8K{tq8PT3ADMTbX(hY)do$rt+>DJjVA3zH?`~dO7E1Hv_5?@iw(`O z_Ivn|M*7+HW{zGUWu_m}2i-B!A)6g)>nV^?L@A?G)P9O1Ltd}zv=vFDh#KLg*9OPZ zAeg)kmUI!ENn(z;Bd%fX&yKBbRrK56R+WfteHwd9HGcir6hF54<+fn5yXex3+#9~m zzzB=9@v@f zlIg;TPHlXi*G2MLN36ad$WTA{x2YmI*{;4x(=r|ykOI< z(n$G%1ud%5d;KEgR32)>Qtlw|e%9j)*u2T3V|svH=zCi7kYaDu1VkS!1BNX8sm+p$ zZLARuyrBl;!f!lQ1w?$~>z0WDBw0N`;)NdmaHmrFFvRL}@o8eD`U>kPj z{?!NC{n^W?ld`+;x|LugGG9Ssv3|4_uPr8L-(6KadM#Mfel5}Ne$A+wB7L0pA*;w= zNv*rEkzZMHLovu$^Vjfw8!qs~`dun`lA2z|)a`w|ueBRtnQgg4!ei0lrwkvdwGJu5 
z?OppZ;olXCnf1#_XW6@OCfi|<9r2P&m~~7jCmg4#(7NQ(K&dg#uYeWPNMbj6X^m?E zz0KX!>!N9Y^QDg6fQ!JW*Fs|?ozOu~aAdYw^%uxp5qy#2suD{wg^S#x-BlVv`VZiPI1 z`tr$7lJH50Q@p=T-QF?uX)B9S{7j2gv?O>sC4 z*l1_zB}ajB=dr^P(S<8y1(t8P-?CIw@r_O(YC*^6NZp;P<{Q-tvAjrJXkV?KUEf^Az*o+))YKv$;12+nMZHPc|n;PZ2Q~Qw-UYx_$qoQepkU9 zYcX@Bw9mAuD{i?~nJlL-h6b|BhXA$`-7=D5h8DTz?@NiSf<)r9!q)XCAP?-x}Q^{3n?Q}_2g zPj;_sxXkwJV|8sVt=txr4s}S8N`;Z>{h2;-rw6Nt zaj5zhS>^2~y*)kLy=aFV5+!&va4}!M=Xh@RBob96DvDZ`s0`m_U)_8)Sn34B)ostc zs+gkYkb$gC!v%MaN%POr)Ly{SA{X76qkQb`^M6hzQC$26vX;LLI~G2K36k8s$evHY zqwigZbFGV}EkRQwS6D4Xog2cgO)x)ghtRjtW)WvQbRew)NF~k5rT4-|)C`(L5cJa7 zaNj?1bgt3sr4rjkZrW3*O0y24r*DUA`u0ay?Ojn10{LpwK>-d_m>^6O9D57T-=E;q zu;vlTLV9w1L&7)APE>uf=Thh?@4&cl%`x2PQ+|L5S$ST=H&$GGbHiyng#V1`e$ry* z>n7^t0;6vp?|Ir%Gnp?W@t{TLEb6pT-EF#D{3pSpXbC5>I|CIUW!JtRe}l;FM~COp z>BWMIJk+BpJ8wt#W_nLs4CY4}<>&_EL?i5kRxeA$4BN6@#mBcL{aMJ!|Ngozx;L64 zaVgH+Vvfm!@qw`quIXm(-no#Wm_jkp@3({hy*Fpf?YepKljCDBBCSO1=8zo;(pSsA z=V*ML8phv!reUd zr!dEt5SfWy;EOVbNiCL2g^2_wN#Jgsd)-!SdJMRB->)8rwJyEq*8mU!Yz|d=r;N-^ z)G^<(sDW`1Nz1};k$MR=yh|2)$n3qrHw?NX7V^YFLLH~9xcrF#{QX)=-ajiJ10VV# z)p>6er$n1sqC;Hjc;D$k^|t4qr;Cz)S`=ctxZ4DQ=QqL?gjolI_NzQENCh6= zi1i$K@-!Z&O{uT^@{XZ&24|Nft0FeqC9_wX9i z6$Zi5SBt>{PlgLvjc~WFy*mXNkVtJbk$aZZWo@_&ujW98inmtY?=~`#QnO8)hBFub zncqc8V16lUi^F!@!;Rd8-X^{E2Xay5t!hdsDa0|k>3&y;loQKaKGH$H#=W^62ILJTwqI)^*{0B`)Zi8^xpCKv-{ahiEdVrM$TCW z{y7?R^DGv0faieC?Btd2HOxF`iJkt!TF!j^ajyJmp2*4*Oth^8(nUvUx3v(@w*_A- zbI~zoFZ@l7f0nZef2Kh36nj2@2>M|JPmh8`i`7OX7xcl)mHTz(@12$#29@hw>+ky$ zF{)4GFQe>+^e`LYiT^SeB5uw#pI{_hoh11iQb=Z*8qj~; zkB<+x1)+{V++F6%-Q-%#mE$J))c(YRdYl;qjCCOq5<8UrWGl3aYNYdaaiEH8{CI(} zPPm3b;k&oY&LZm@_1_Hn5x9cpWRvE4+!6d|W# zwhhD94;%6CLeSJuhwrSlJO39#u|4{&j*X-4zbHkNvalhRW^C(t6N8yjR?zSDWv<E2PW++Nx2)hNDB%2E)MdljYY-1VoDg;IN1WMncj$!0PQhnY+xHFn*&CH>Uhtv?92 zSNLN85>L8iIOQXOwf!{+pUHs~($q?kDm;x5Q+MF}fl4Dw%r1A4DN$uFnmG2Zx$U}2 zxED?LF|-S>@dwb}x2xRPPf}oq;v78rdBuWD?8yXm%beDLupMWWyLBglxiwygbDeOLcCTy4(%K^7um> zyC(jotDnPq*g+NX3%;prfzH3a%epu2iS9pLfLv`eqas-NMP4t|(Ts>!PwS}1lJUAD 
ztLOWD9$Pnmf7z`3-F(42-R+PM2TF@;e<2l}OcHf++Vyqv3rOXj_T) z*1kvQbJ~>qOqEwAQ0w#QUs4kl)#&>M;0j6%>WxJ^x2TxemRKll7b-R2N=*!Dv@&a|Ep&XUr-+ejTclTHfz^Yr#nC za#3i)vvam0L&##cQ@2cXTAL6m_IbB<0!G6|*a)+z&u_uRg7+^xUF=-OkEnVTBrcjD zeK_iuCDYzfaT~h~;zTzv@$0h6z3bZ~YUB`V6-{0OKxlJrQ)28VA(knX&{*gncaT8t zETxghnS~5{PZNnFYx3kp^nx7|iA@G}cj#h8FttkO{f+!gjuGe_6e$Pq-?PzyOO|vV z-`qzeG4i}F)iFjdg0~!`m5-){BT){b>4#LMVZyUlqt?Osm@T`g=`lUUX{3Kab6QZI zFpMP=xI{X|D-ZvKgh{w3jY-ZIXuIH+GlFJ2+IsP6RpYVh<=zYa$W-6DgHXn%kjY(G z94N4dTqk|italN_ffiL}?A`FPt%L^L>b7d+0%#jE^={*sN^b=v|P5H`e8Vzp=?ww%vBG2puY5kr)0mtU& zPyB)u{XVSK!rb4il}$-90w`ld!$E;13plOTo6PI-u=fueOGbO&XF94phU8{bg^AGo z(&hm}?D3#XI=;*x)`q5EW$RxkTzXe5A#+3Y4U?N^?U14t$|viY`c*imx(CiO1o{|0 z{=0EKR!e8Q$&!aB2~!WK*yD$(u|GKgUMlBgzkagE*x9W~Q(4RRs&u=9O;5Yq|GK1r zt@eM?QYNs^>XzP1?jU2en3*co3Oh!@a&UC7Nju~6bHgq=z}UsHOKM$9wq<}swvA_J zY6Dm&s6Kj(P%Va z5?$Dz`K7rb*L^w-CixLkw#g=!5z{LeY%CaTF#A6@$sS6Lg~wA|pg5apcGayefgC~C zr0w0|ab`&{Q@a1&+5TLd?4c}BCd?4g)G<>%lRpOx3F4@c>UgQ~yQN^{&;H1~Z?I+^ z!~!RcV=B`WvDKP>R^4CZ*mX<~y_HMd7%C^m1L_SZgohPk2B`M=$G?2rsTjovtyZ^; z&T?~Bnd#;89UpO#&+93~Quhz>z6t(qzciOO+9OIk8wd%c|Nc=3-+v#(JgGZA$O_e^ zf9|mCYpiRaVoq5hdP6oG=m|3LH3EmlelYD5AVY+9j?-(#T9{EZQCg=NpX}^yGbvGc zNs4xOSdr&;Ox^)_hRsNsdKbvap;w_5;s4x~H)*1)JqUV%oO3_m-Wk4bZn@5te0_m9 zN@D3W5a1+eDPf3?{oK~`H(2!!9nmUd$*KCzx>8_+_JLa+xGndpR#}(Ozf%$ZTo8(y z_5=0FAPzLF8g(@hC;;Y1B}4~aQWSn43waUftW|+meZ5xTBtmm(7MFcA?OvLO8{Ko) z%8}tA`mq!)Q${!=jAE(0B_KmW-d?y zzjiK!U||fKWcs=h@%G01(x^lLb1X?w_O~d4|9(qBPM|Pof^ZBE`Cu@Qc!fEvUcb@qH<&!R_4Jgpx%9 zJuPLKfP0VcAEprRZx_V!N|NgHvGQAaMC6s`^hVZNAKG$kdmjndJc(Y7kcFL^%FD%? 
z)#jn5ulLzF6u;N|{O zOrMD%|Ck#Aw&_Rbjkvl<%iGbP2A50=7s94dJBbJ^y>ktY`Ov31tPoYVIUegJxIJ#0 z?(YE^N{et(mN|hpxy$iUp6Z=aiA3w`9yv0Ee2fujmbsh+7{op=O)H zwD^|?@94yKznwI*2-rvWdO?fDp5p?0DiAk;MfF@OQY_5dxeXB7Os2xUYACu!ql0arwK>YqZMXOIL22+0aG&$4E) zI0UKh5gjDLQA<^wdRSO92SehLpq}y<^0V-dsH-F(z3O6ez;-D5Bp=HU+k)Iagzfl9 z+I|-;C&)KJ1qumCG``N%1%a>(WF^Hk&anMofu>>I5k*x&nEZu%f*>!DK4>-h`@Q#J zCgLczDCDf}0h9@Q9P+7g(QFPr+l?S%9MlT3XY(gt(t5wSf|L#G|5Em?a`$t4K`8cc zqebRIutJI=nJ?tdjgs2TJQhGN;ErKIi1^W)Gs~s_BI%OoYMs? z(819AkWafLrVaS*da90&f}8aunT{~m=&tdmkS0hFEt|mqj&#x#WgHFWo+RnX15}z8y)Odt zg;l|9e+N>Ab?+AQfmEr)o|fR|e3-?7ItMl_;gQ8@0~;EJW+d|Gc3JqwG~v5eR^}I| zK}hiFxp_~XgLqMVph*}}V|O%vWq|U7da_p7>XNTdl~o@g?`n(9j)OZr0}1JVyK^D7 z6M>$-Ye$uo+-T;U4dL+mvZ-`|;3keXd0!dA1oPi`L!xcjdI7v_%f-#6_%>ojK`F?^oh-e<<)h^U8oSqF z6pIRIFFnN^yEoy;*JwAgGj9S?FXBjPDUU_1va_LK1#`y;qo9AC>suO>NQOtHC%G4`1uCbSU-Akb%!)9c$<`O!@&HBM==7@u$>!gUnp9w@@Z~P{u^=_WPI~Qf#VjZ-% znjm!aWxeb*bmy1=2r*T#z(~7>t1z4}9jdCwo(dFu-);N=k&0VPei@S#eu0;t&YX?A z`oSJ`Bx7?DZDp5N(Y8$H8y2*_YMN9>{j$`qf?o87dq6Vaw&bQ(p;G!wzbSla=mep& zP3e=1Q-50m{pdAfzl(Te;CM$c%`(G`_PDHyyEG0o&16wIg)X{C^Mx>83SP9c_kCL; z-K-ru0zCq?R4uhLId`xv_SrXo*Xv;vU&^G~tqnTD!1cKOidBFEY$$qM$;eJ<`L%-+)bSd}^6e zDK@^o99ogVg*F#}*FnjP9#r59-g=r?uw-fm;~f1L&v*lK_WlSi(-CC)6<@-l0bI)Y zU$?ZWozYL2%6KWxO25@FBiAYom3l|N97jc~kCzOEX6^~4TuUo`>83`og;7Zlrm;pZ zy0F8G!0Sn3jrjAip3H&>R6b?=gIyz*Jn+@Bf@}z56o*$pa$iHb>Km`zhV7}gK6^35{p+6>%UJgHeaAi%DeF@K?nM)kAZJV za;y|fw$gi|(T4&f1+nfIHw9ZEu2_4LsD9U-Z#8A=XkBd`pTQ za^}4?MR;j8y%ZqtPo`@LFP@^*_d*9P36h3*%#|+QMTe=Ad3Qy-ilifcNJv&Sp|Sq- zD#Mgh*tCU@1i3###*3Z zICz2~oqcQJZC;MPx+ST$*x1j|bZeoTo?n8~srbS+-c;YV*Bcec(7L&b^{IaPne*wB zwbw}CWkfdXrlr;|e^#G$R!GOlIfLT*LuH_rQb+&C;~=ZIzTzv~zlIzM3ve8h$qB>n$@z_RV=+ z$KPW*R6}>vR48gPF@evD;f-vpqyFbVBSb$>;_;y8=8I5?l|c|PRdV`c{`C)d`fhrj zH*BNrI#kj!d>AL^c&&>QeopDeqCvJfg@2fRhW#CG2%Z$a{3C z&|e?%q_^T@o;$h2Yk5Ite;E^iDb2IfOAqE#l3K?FDpiF1a9A}N$=`noe8)A`8!%~Q z1TDCs^79^XN-61~ciP7_X*%IiXPn<#1!G0_85U3{FrVfyJub68rch|YTHi#Lcx6CF8A4)os;3s*Kh8O5G|N94AciTPf@#P=c%Y 
zmGoU&IDYYRx7_#n6j9x@NfaA=(8zl=`zl3+W*IW|ZhR2`M}Dr_tl>~~g~jyl9dBv> z8=Q8@avM0m>fFI-{@{PK76D8+A0p1{gDOlBn6sSSyeII81^)ZTe0~I>u<$&Bc!gUb$_K_u6B{7 z@|KP8sG>?0Y>l5N#sm*=P2{ePX2XDT0Cg97** zAB~=^loF!9`sG=|L{b4y$Kv~*UHmeuXUhKfVvqKRFH)v?_Je*DKU*AsuV1`?DJYzx z00@H7V(EWxm84il{uoK|;JX#`OTE4r#{Ew$v&B6q1{9GVo>2Y?-RMj*RYbkA1Z{1Z z-z`@&!pq^&$ek$aiJma<{MVPf?b|ituiPduryDJJF4jvB`(K$E#J8_$i7#$3rMSZX zs+Pa9!$VR&mqM*^z!wnN7IT^AU9fd4x`<9I8@*T`(8g&nr^IV*=2*>_X=j^V1juZ4 zF(q!#c)?HcfYwSpi|~7Qa!OZ938})o;O8@44s@u|D;7MNxYyD<*oLfWKFq z7=_7e=*C-~p79Z_oc96EA@z%`k3N4IBF$a1QcIC{GHuTsmK(O-U(74gIZU)jL!E~T zz6ir*zDTGX0EbYNsqHzGzH1q)ztXgBmEJ+UT5oMC`LOsI{|&f>C&%iakpzApe-@QS zRzn?Ske}|%NbFyWmNOm8834UFoPmiEa&!|9JTL339$o!yZhxjY^I?o}uS=K=+l9O+ zk~l1%)z9PDT_K%a;P4cISbWxqqFC`iW1fhIg!_1?=9c!^&?) zcNQ3xcAO^eMJ@2HNACBe$}xzi-jVnUBF#QAv;`qVlFz}R@1Af}KhgpkvYet!|4h}p z-rvX$KFRmmE}8LP; z0bGdQ!pfty2E#5Tf_S6&DtOD3@oxLg6!UD2zSo6@7q1o_^ya=2`#0}n1GcHYSmVV4 zDFKIZen*bhg&vy=Y#S#%Ac!9{YX3u_`n$o83yWKoJw*m(+r`kT0IxcHhjeMJn;&v- zA!Z$OL6?8aH~;yFJT5ntMqB`IPc?DWM&6@?pNy2(e&@=VK%uyZ*QoX%14S`}3@G;l z;7LK_6-hXTGPh*cofW|~o!XJ*HK*LGHdF@TOncPv(zjr19t(4lo5@^RgY-lyBtt@W zvNFBN-V&CYWD3PAELTp!?c(RnmAh}l|xzKhX4l2mzPXMJ^^Ow8}V`^3naoSY}f zQY|(33S#8%CfQP*hiUWVF+V1=X@{TeyUK_C_3dA9o6}$j)ehc_Du>^^foSSGQBb150`r0~pyw zMMc+Cn><5-?%lsW?mq|w6TzU2LvtbxV*j5x0T!B6|C$D`V;!EWXHNlwNVevGSqwec z&lbbS|67aU|D@pW)L@Ro!|{ra=ffgm)wo47pR6>Y{|O!BpR~s!vERFej7e>yLBNl! 
Ml#*n*`1`>B0W$aH*#H0l literal 37069 zcmdSBcQo8<`!CG4BSeW720;WNh!VZdjxvHEM2YB~L^pa(bOzC)6K0e|q9%IFXwiFT zL>;4M%;;s#NcR3c&w17<>wVX|&N}Zu@g-*FzOR0LuFoa5fFqBs46|u^)cPfxt7i$QB%VgnvwA&-3{Ik5m~Z^_xJac z)7s7piHO)K(%Q~$-&21*05|D6+gtPBI^r{OJ}K!najx9~pZVt@{bvhx9;x%D#)DyM zH0)ti3MBuB{wbh%!i%aQH!+*3wNizaRyKw%acNi)Q79%!4m1r4ppKK?eeBwOjm4si zgw9U0IFj}~1*&BJK48vLd(x;Wwl8V@=sLOky?5BLq{&Jda>ZoHoGQ^Cwx>&@=4>Ei zKHt0d&5(SJiofqP!q`N3I0Un^Uf)f+mxkTymB%Il&8b4sYq&)ad4)^zEOmv7+Ub!w zt~b@{FXHsyvM_;2v|V=Y@(35f1e5nKaeNNCJI>uvN$WMQrTR0zbZxYyx)|$-$83IU zj^}f*rQMx*h{_^7ted&c%q4fGMG21um|Ll5ju5Yhl&(rLKhGB=228P}StGyxe*Xg1 zO-A#d_M=up{WM`i2BTwK7v46DI6l-Q=bWo#J`BjU=7ZB8y~f4zbX0mTp1#3#A$NbT*ljlh3R~2JQ^)z@&Z?7&wxct%nYYi% zVF4^b{)$A3-5x4j+e424+j^ubfM;!D@)U5lwAJ(TOGTX)m2~O*W}&W-wjQoP?HN#- z-m=B{am7<#=*xVD^ zP=@pEOy-UE))JqDJhS&v!hfE=3}G>uS(mM&gv+WXSa@4;Iac?I zGZ~PEma6#`!9@7x>h^_o)>d}2R~ru>B`Ql^CShR-_;b#2ldQBbcET?(nRl{r47TdM zAO?#?tzP1L#0*ehd4b&oDkk%-du!ARHNv9`92pp@!Hs;oLztry!f5pP&Oayq^FaU6 z`BV}skHf*s?T2-Hm3f`g3an*eV{O+VfZ(#>kWd5Or9iPZ(^5wxZ{_(-I3~W_5o(qS z_Eeoq5mmGaer$#enV{|3c2Bjs(?{SBts3All^pF1f^Z)SSCLq}2s?Q*(sdEJn}!#S zk@ERHilQ6KZHlRZfa+B;4#!f?6KL@7U=St0Od&LnxtH?AS9MPxo-X2PDQ3$EifJMY z$(ogT4WQ=8*5;8Eh-!4@>JMdC@}VZ;zs#o@{nj_wSMZP{u+Ls5>yrz~3mWApb{3K4lXl49N zZYYfEX?!|AHKXbql=-*cuzk-Yiby=bs7XpcoR)B+innkxig4W{GNulkSLU_I&Ei`! 
zBnCjm*smMF6D8WtX-)u^ho`0h^R|*v0o9qc;YL-zrF)WrT&BT{=DBCdxso1waJ(<_ zzqCX{p8+z3K2*i6A0)byP*k`iv6Q^6H39XTW_MM0af-o z;`d}B2-m$V-5Nt}*RL?y+Mc|zy}7$2M&&f`RH%uzb70)@Y@d4{ww>b?8Y22yJ@$e| zXa=tVk}i*jKhts~bt<#+IjpFqOSAzNGJ*GzuYvC{KF{luL=B6oG+1e-nCs09;Nibd zs*z+D@-19dD>O%&)LckIA*{#otsDEuo*LfXRAYuI#2k^bh3e=g-4u@S%Zt_>Zo zBk9yZpHol>fy~cKj7nVdsa!%VB+UWV3q-qqfUq{2pFpz1N*>}V?A+4EM8MSrOh2ur zE8FAU1FF?uxo^#Aejdn4QoGM5?RTiLe1>h{j`hCz+fxM`}S{JBY&PlewZr>xkMiWWco4t94}ZwamR=>Z~urAXSBI$vh|l;rb7r!nR(G`Qp2!o$@hwc~14DGH07-Ws;b^H(~#$u+h%t-tNU zk|5bGg!J#czhP(n&$Ck2Tf%#PXIv-CELhWvJD7KWfB%21PWWGc(BiEwO|f;Lda#Zv z6Mv@dwk|G*&U&-b>XNTVGE%qHp_q|5T=grHG}xZ*UMG`$!1JpwQtnE>zF2m&-cukE zCQ!v8CD>F)-&5Pn((g+vb*opVXf~@m=sr;^w$zcnMT^pX1|?(fAYlJE%;|6^_MsD* zFH6Bn;d8ZOM4>3}y~1b0_j6*R!5=@2gi~3DQ0dtxiN00Ns0nYP96{^w7e3q5VY-z( z`XN0dpMbN}7w2k-lFF{-FvA9^?ESwpO1!bw1{JM8Ljy+U{jj0>aOMu9*5jg^4|P zj)ApK;9|A#3>s*7*QC{ZmsI;$YcU9(8jCwVys$TWC-1jpnno!;ztyrO_&E4-Fj?CN zx|smp_nZ@y4-lVClA%?GsGZ#M_zQLU^$fnz;14ozeF#Di$+c6}F03(thJ9sLWD+Lg zaSbN53!Emt^;~!f&N`IFjW(9aOGtPBv9j)tQfeP$O)mUuU^bG|7s0XlMkHS5`$dn@<7qCF(!q9jlwM&zTrwEY6!0gKxVnDv!LMcbYqo7!5CGK|%U} zLBcamzyJ)_;PCm(Twrhy1!v+=&D$w<>6WP6X5=P#JcVO{1Tcr)t-1i`P_vE(zso4{ z=AgHbl)^y|U4^)V z(whYsIz6*7(u*Zb4{9>&)7GGN2LUucLD-u46ksCS?Q@Dre9ZbLL~j5Uo5WGZ}GvltMH)dB{6Wzv-Aul)W;nZOMd z^77UVUAxs84Bai1?&kq~fFz*Svk%qwB?eexV*^`+<2DK#H?Yuob^0l0Q`;rsI)NPT zeT%ZGw_O5IN>RV*BSf`#;n0G=hDHGB4#H=7a#K9c!mmgQNSy2)zF*)LQp$AZ02sxn zOFBOjwe+n-1ZsFGMuDO_Yv5ugXl~ye0G^S$VLX?ii<%);1-+ChQ ze9s@DbYdT@g3OB2En~0?(s8Qg{CG`NWVqx?jmmjXH{A;uxrWqWt6v-C3DQ0g%dsEU zkQf**L;C0C)bd-0IMp4#%-FRATnD%h%?Ij9TOR+!O60WwIxAmx5OC}W{j4%Bj43I1 zSpC{3PCLdoB-L`yvDKtjbIL?yF{I$OdCIga$V$~?^KS2W!eRh_jFnyoLE}*aW~`9G zO!CH+hm7SIJ9Fm(myseD43wDG?3JatUK`sQs94*+C@|>SPqx2>oo7w}e zfGUK1oNkvkd(FcqBz!EV1Hm+Lw}lIAMdQ4gQE5ot`K-sqxtsXTgqTu0TpXY`L;%%> zl!mcoyf4l{Jn?JSk`KrOFP$LWD5P)--O0tr5&nkg1a6*d3X*sKBvKhKxnVr&#x~;rFlQaIzjn z*Z1lJv8}Goex8g6H`;6G953KdVj`2(>IbWyQSpKW{2Xar^c)i60`QPxHx~*IkneXb zm)lm)$3iN*jn){=H|j!-%{r`Zf5Z 
zXUo}o?=`a^*dZFN3scOeCL(vLN6&^Et5%OsiT2Xk5VhYXg6}Gh8vqI>&5J41eY;R& zuLO7`-0zo;iyZot-X#W{v_*ZtKD0=?!}sx0ja$0%nC(MjMxV_Q$7X+pGeyE0V}5d+ z)4UjFR7&?9VFbgqsXMY@T=Pz^$U?Q#kTyP|(%QUv*mk>K&)|7O86oIJztDnchVwDN zOt$uX-LL>r>Go;E^@E+yzUfzJ)wssEk+?H;qUNJOfiNnfh!O>6$UgD(I7MJ1ADg|Jh9wbu#mE%Iw*q1S?Yi$$>(b~gU1agN$0JSwTyYLYQS0<{#UOh5CJ2qodT5%kaeTKxc6#JS zjl(#;%}WFBbCYxD38i0H{vo<;|1_dZ#_*$Eh_d{bW`gLA&gPMdA?9YwFW0QT z3*7nkUD^X0z*t~f(1LP$orae;{5>W@-E5oB#0nK z3{y9dw(PRvk7a5Mc#U0Hs1U%XC4L1=sHR?|j5oTSDrwq&0lUlfLLd>kf* z1`r*$E!29JaYKlO?IJ-c$IBL-Jj-Z!!$FKl>5F`<`+M)~H0G{%Ia}?Qc$t$~4}T{c zN+9g=M4i}O6EzL!fG$O!srZ!n5d%CZB@;~U$0HW}%ASO(p5w&0n&k1(D=go}AlHhy zy}wJj&e`Jmj4ORnn{pIuKnKYK5AP;2)?KNkm{*@JK|XRs5Gx1c2n%c!ruFi$I<4SX zd3*|d10Pq~op@iBkvepUY>kmhDD9$pWXi#34s;QyRRc^r!HVG*zA5(gjpsz_tDi@O5^?MCoPhNzEw-4+klNgp!3QX#&dC7XbE1HJvf@Zst9Rd`BWd-FYj*AxMGBHypIS@f;@cFfbr zy~gA%VVhvdJhIB;M|c^X5FiMk)G>1fa=`sq?%|)7mQ~(P7T8{`jWW9sUk0TuP53;g zjGaS84EE#wrGw~r!MlFVpULKDxNk47%TL!uB|ZSL!-f3TgnR3*^BD&NzU?=Unbz^R zJL2EG3;hN&XhPd{t`uKe7N|V&TmDwov8e>@LO>O6by@EYi*au9qX;FaKvaM7%1I^O zo`9oK$TT}<=(mOeL+!;Mr?(@P;Hp-$MYQ`4T2@lz-R2xkC4?Z8cLq2*?O@n59{tz- z{<}gYj>RNUQ2>p2AXTgqRb;(~g`}s*5`w^LwU3z<0-lkAMll8bTF8SB5#1HU<2gWDceFLv&sV zxJ(cqW#X@*3?7Z3+T{y7%e9>koVLw103@X5q+_*+qSGWen3jV4ZZs2uIFbr`+Ajds z2!i&CsxB(IGJsk3;?XY6;B8{RGj!=u%kcrZkdjr$Vk`qKawog+8}EL3P1F<@z6wi! zW`_xp@)tL2KJH5KBwIa`1ekkexPktvFZZMPDN6*2pyg{BAs}l<UU;HE972l3P zUUF!q%-;MO$P6$@LgkPtfx9mDGNkec1)tt*o$|ON(~{bEB#G;1n6}H2(UQ1RoZoyJ zX?5w{hu-mL7!D~z-j^YXMS(fok(sL8Ag$+g_sOBAP{qD3WkU*o<8Wbtb2!MX#P8TsBG)o#be~!z+RfuD z2Vq=EIg0LM6d~vo{pq;t4jXmXo!%FygcE5%MZikWF6QMp0Th|LM?dpR0m6GJvb?8m z7Ub9F^V>lWyZ9F0SGj#qEHO(bdwJx0ORPf#T!Ky49}lBo{xtX{uveE-wxg&82* zSP(2?=GCM85T}K>)^vUIf#^sSVHXe6$(U-b*?1P&U60s${bSUP-aq)_XX~PrKllJ= zT?I!okJubKT}D&wQv68M3=y?8p&^>;z~1SSx}%p)SBW#~mr0AXz0LDpQK9g0z!bb^ zNJt|Qq-1ixoa5rp{U&?Mbh1fwqaK>LK_P^@Uu!esPl#^+?vi6xF$dMldW`^ zGKc$i)smh`>nvSW;AThr^D~_hRUI2KY#Lo5tHHfSa>-vTf)Q%L;K@oh#o^s*cu57> zfoOs4SWoQ!ipeZSDRKyun(+3_jv!*CSD96o@AGQ8Vxt%MR^3AYeR&Q! 
z-k+Zk+1UJd5K3pIia32->TmEUpfoIT!2YAjZv8|7;kZGqbh%d%ZJf9<+RsS5xivi2 z<=8hGG%+Z;58o(wvc{-)aUYr6JRE>J+G`b`or;>x(F8wv3sNSJxTJ|i{2&~8!su)# zUFpZ}as6!eWEi3e^b3|yaA}W?`8OC$XC;a7aN!?RtyTvcwb$6EukBOX@t6lgT5asY ztlc;kee!e%oMmGpt54@xl!(|B-M7RRl(SVjDeTl`z|AWDOrh}3w&)+&z zuR|c?s)2_i~7shxFqG&uwz+fBY^7xzK;FzYYG3N&G~WiAKC zhG)+%JU?r~p2=+8Q;JkUoDUZKKZfTtl8CE##(seerZI)!O%pZf6|i`P2|g~)WHP~5 zE+pFF_f&}tQyhNbM1q|<(gy;C+E>8h6($IKrV)1)tHSWH%HFbtWU;GiDO5W^G?^;| z^GimNuxmTmtTF7OIo^CvIKH@o0c+K|EZNT zX&>ZlZ{+K0+T)homWvmAtqz% zoHvJm6f141q`Aem>e!t$`I+u-@Um9jNW-}M!G&YH;dV$s#S%lV7wl{?(}cf3a8p+r z{rUacyBA;9t=ddQ-(A$kD(tjwiSGtsS`Nl}J$d7(hWOb67HzVkN8~rV6psXgtr4-E z=94BKYv4=Yb;xq<3%pDq*Ei+mIUSEGrSC76+-Tp}es2BRt*q<|+UT=pA+3$ol#@iy zh5hLj`Xhg5$a};#JLtuci8OQT#NC`Xr&o` zk@gscZO_7KRRem0v(+c27+?KF6b?rU`aiLPICe%Mws=NNbt zY{zK+s=UQgH1k{4cE{d5BFL#r*^Hgr)X>AW^$d@ijTuXn-?CRMVHXlmgRnO7-r*;L zBvyXtj{o4hXjzdc4|~RqV3~jQRs71el<)`}=T7t3JLf<9UB;evUTjnwu!1?B)bRvj zzEw|L0R7K;u+cJQu4ql*dI)wx#l>r>>8U*(Eog=FHsi8Yal*X&2WIvLpHb~Atn-t= z&c#*NW>r@I1NN~GfV}FJOv*(U11*8*^OHJY%fOe*W4+Nx=Z5KU3*!af38@hrLk#Ey zk0x##Ut_%1zV^MVa@K3?AUiR2tttxLtB!fsH&haVZXF#)p7D!c{`Fsx=S9l54BPa< zz{dN11l1I_y%Wo^{d*9r-6?UJxqMC7*sFwvWqI4aDE)VYw#@yq)g8c-pMed%1noKT z`;}{5_%0*eMQG(6s{Qz}(AwV5K<`anBF9=M3C{x-&tkE*5;8OmNxKhm_kdWVOc^t| z5V@}WbGe|2wS~yUYVwM1=f?i#TDTC1BD$|p^@>+O$Bl0qV=hkyWKbpdRrKR{!rYXg zmhpymUKH~K;($G-JC|7s0I4QjhZ2kQY@cbbmi!j5U?An+O8nVI1$E#-np3`5Kwk;N06 zfn@ZujXG1fMV*wX?;u)x{`K8a#IIpVtquTQ%HOK%*TbmN7*Q?Oc6o`BfD(&P*pNFw zD0XY{;%NEM?a>=WAAuKD6;nU$lu5hXP@Rff>w^ka~KFO-LdL zmb%!L{Ku5yde9OjKhYBs_83|{dm`m0e|F{NnVPai5O(#YQVX%wU?!OxwXX$Pg`j~z zk0ZT_izZ#67W?e5ZmKR9a-sPAqXZ|W5rFj--uyMOn`Bzls@T1?nb52Mb)(+U(Ws(C zU9JKBi1}Q@ol8`AdBX^ItCQ!RCsM{LCj9!eZDK@*U3N=X;BZXO%Dl>Chr8gVx8G2^ z(7J*Uu&0ZKXF0g+S1;*OI4HS$M7Q)i-BkNJsP-hs zCS-z{6l*zvFOIpUHHs{_5@;av}CGdk8+4@W4F>#m$A= zp4;CgZ%++l5D+9Mub^&^oT^(LAwkCh1oeqHorH>n+JvwQR*~U^_JC6-roqd?T?(Of zvv8#K_)Y_vmRsv*!pSUq0>>&uW?47$Qf{Z*Lf}BXGjKN^JLzW;;ztBIt3_g#!&MV} 
z01<8~R+GO)@nG3AI^HD};k&LmWwcFk6@1R?Hn8rYj8L+bK5u$(ETPxp0YhCx?w;fK zhi!1YFc_ijEn|g7k3JJ$XfJ26sUEg z5Hjv!^gLwo{Y0Z)oU->enNXh|YV0v2{K}JgFV2My$& z)wkr!&9)vSVHqME>9jG-eyRM7!^N?x!2mEAKm~a+&$Ly4=i$T0^Fol3a81$vQ}t9) z9yKt&D9dl0jZtsxqL{UWlA<=WpEWGDBsR6x)qjlm(<3h(xwt7Ca-z{Dx3fsb}2{^LK2`p!K|uAUU_~ z)F$@=6-I02$W5e=46aQT({=%exLVP=6;t-9vl*KD=jD?GD}k}zf3{|*JZ92SIRV3! z+0p_K-wp#l)1k(n<=~;QHvGJlI0kU>?SUXUbl=P~yzmv=b+Mf)zgQA)!a~wJB^NK@ z08jIF23d*?MOlUDRtMX0gU_WF_yQ~m`2XKa_EnA^&K#1LqZ{lmg3v9HNi!X zYNSiyY^rAeYciZ&7;raK@B0lIa;Ny7_QJX1L z@1>ozAwDI4YT9CmTh4M2&w0!-+S_5R1_9!*)ROl!C~UH6J2Lj zNfo-#z3J10-K-CEH3iYRnvsu1f3Zr5WmSJQE&Qgf>fq3rNEz0Af8`)5-X-l5*1^;>gov}mC(eNhR-^R>A8 z)hqxakSCR=Oi)Y2gW_^xKyr)JO%8X+czCSE57xEil*8(r+8cLcy{QDtBQv`Q1wkOn zZFxZ^hEK=eFN8N01i3)GimEKv0+O z2Pr)X|;Y+5|Y1*Ph-#J%l2SS*+Xf#b`0@!H4qF4`#k9(~|kF z{+}fjCV4#{=D_VJg^{t!VUA5t`-`m*`4lc+|GgA{i!Ie^*)!g5dc8s+#9W@UPyC$Q zbavoI?RPJ&f?b!y!~b2Yf-mNm98?;j$IsspPI@oRM$B1rgPxy0gEjx^HqtEb=lG?K zmRvC&uE+}i)}2v^F2TFq+s|z?%t4)w*IV>+FxgYI9?o(PAJeh9(eRadPT$o;=2to;A- zf1?gXY*Zstv0zoRRII0}R>~I@)%H-I&z9v$&U`4pM(DNerU+ zQj~brV^yy`k+#nKltl+2Lo)aW-xrU#kPraDi zrY)cjde>nyw;2wrNe8_>Ivc9(yf7@BL z7kyOb_^kW%OnyLx7u}~e-Tvq~hvPFmCwGS?OyhAcl5_8hW;bH%g9^^VHXY~jqP`~$ zYWkU;Z%_yS6}yo_=w+re_v-RdMFMduzM%j_`4DXbg$PIbjbCV@_fUOeU?p8kXDLQ56*xR=B1tzC%R)6vzWL2`IZmCmTg;4nc}0W8k| zK1KG6x`YyvwOW*HqmLcgAY1@2sp)$K4{KV$)8?Wk+lp+zi zP#0>g=&&Z&Q(F_%*l{}t8RLI-hVOQkwHS z)*J1$fZc$toT)k)j|N#%;W@lUl`xzEDO7uB%=t5{+Z=m(1d~nEh)6H73$c*BFwj9H zu&?%rwE0|er();Kf!)9qYj4isA2(G-Mz-+O(wStNkR7%#;hSQgy!r7=ho>$lm~E;s zD-xZ>bCGGSW)HWdi~j8_EQH>-IVpuA?=5~Nh~wt)>R7l>G9=PgWlWXEgIZ9ln+i5q zHcuh+FloOqMSS)9+ADtG_MG9DP{M^CEwrY%h@WjzGm(f=nyULjh`xZd##hhpoDZow zg_0|F?0AYUsdG zPJ5w1I*85LtN+7lhUJ-rsxO%36Jd?&s0eRlVzLcY_$s4$d$N1;>d?*>CuFdq{Y^ak zCVA_CnAD`m{b?1Gb_}vsl?aq0r%X|S#+wTu5JM-{1|qhYAG%QW{W4TMCz9%ew@BD$ zpl$bW2I&jP9-%TaJ@DW?8nm~y3M;OxvLE#j@XIBf;(*XB`uucDC=1Jvp@>GMfpNiG~yY;Ll1y&Qoa;jU4>| z8j(Ve<{CXle-;uxk)$vtp%BvlaOSWkf#l4Q6U_{0szA%!zdsaep(j4F;DdiBz-3D9 
zTYe(P&^r*#n2}UhP(9!gdMV`f&qy@@wE39CP2zslMi`>bCF|EdJl~a-26n1Pf4>KnW{s5QlfrLgwU7uNt$YYgJK`5 z1h-!ovj50OcX>^>-Jqy&EhE_l%Fj14oQ6Mil_HlarHPN##R~;OUF!13i7Pk`c&A6a z4epPpF0Yd=k-lF$4jI zqs$Z%byC&dvT#ADT0`jNZ6JG&%P_b#$+zVDI&kUs$6*4DP0lMi&u9$s z^ISmT%E|sQ_Lzr+h@2j@K8@VEWn6sW3r`gIi7BSq4S9+mJejg1B<|tIrs4u9Mh$DF z94TY|?2c?5br<64QdQqPvm0ht3jz-!>H(K)*%B}D6B#s>LOGgM)%Q;bC=D1ng-Z3Y z`_m!_%ySbzTsyuo^OYuEza-4pyUA}~4r>@joSs7ja*@Mmo?m)i@y(x4`(QBwviWXd zGLy33yr%1V?~1l5oKO?rJo&-gst@?UAAh%j*~Qg*fRH&2yWO2Sk+#H7wEekPeAl18 zw>lvK-Re;>;PEu!C*uuISHK5KLk{zek1*N0bWE`TL&VzdyF2qMDkY4~MAB;0UDK6h z%PPwzy$443V5&rV7dq3tK0tJWygH8nAoB@uGfbM@+y4T{wESozt8hmXkV?lt)Um4 zr7TB8uCNlF>x^&41ElY~GS?(O6dnmWG|%#NJHVT}@%!f$VT&;D4foX<$@>XU!=Ox2 z^%9dpbPIpKveN~>h=9R-Xeo1y%ud60bUV46nP*9vT)jAVYl^QmBWn#AJ<-l=KAGP0 z<}_|Ze%XrO@E-SCV1MbGSA28d1#j(<#y!ae^5AE>e^H{-SC~el2#LWJ#%U>-4z@_4 zhk-2zwjsVy%(dS;Kc7Px>VhA!*&bSm9*^7uH8gF#HTxEuYkbo=^imdJTH!=R-*9)y zv_mRgrN)!%6E#u41TpX^k@p88tn{he3K^au^t@+eulp+v-s$!AF&1?k5kkfAJ*xt_ zuClzXgO^&O*LQ(HH9vXJ)Vf3>E5V_Eg*$$kBPGj5kgAlVjk=izGIa72_ydj`*4#RoU=s0dRxEP%-o&v%#eHQ+W5kexDs zpL<68gGZ~Q2<2AH4IYIJPtR4;tTVeuiRPnaWItTgFn8r`m^Kl1Rl8*+vNiAz0rYM5EN!m-6>eJt*EL;W{yO85O67TSpk?izL{)VD2 zL&80;L**f-!fHrl>ORm`c1wFj#vAiP5|Y7*2n*cQurU`|*Bo;_+c%QX-1= zHG;hL4SjH%(W)$LZ=~ji0Rm3|gmPb;J@VlJO$I=P6NU*&i)nwwH)6o!y3-je>bPOo zusC9R4r+1M(|ahCDjt@%H?Nc=uvop-^&oxq``^`QbY)OZYaK<*%2*y}2@iNSG2)9paHVa-ZOu+dZ? 
z5depA%qPoYPg!$QxDz}yTWBMwC0wYu6KIoiJqps2XA=q6?LR+{GqKHuakrSny$xjG zT-(cZ6T}vIP=LPk!!Q}D_1_*FUp~y27adPWu9e>s9|k2#0kjO0oH$%-iW zgl>Us?IHi1KKP-K3!im>4Vz_{Y*?bLv`|obO5bd(YJGCYB(+em&hDrDvDbBboNLr( zz)|9U2F%nAhKBbD^peN5C3=v5w8y)J5wbdYt6}r^?k+buZ5Y=1>7Vot@$ANStQocg zC}avl66JRbqqaEvrixWhLJwIXTsUAPnS=2QQnHn#h&t|P-fmhOUtz}FmhZ!dN!PdT zZK}KIig?HQA3M?_P+g>^U=iPTQ3V#b0D~kZ7Ek7P_XN~BMIEPqwsIHjlpxNWKelc)zXpxF zrouoDhND?{fVv37)GelNRpO)lj^ z@EN#^u}H&v!NOAdD7(67W$L-WMFOyerL&KAs#@aCCXHUa4pO;g8uBt}gXuq?Xq?!4 z$v$B2nmK|zX|>zG8euo9C5o(ynQ~a&hBeFFT#oEWi5!tT;vY%R=E}`9Y7qfBU~I|# zv2uC;yVAB5u@9}u^e9IveJxjCbwn+phU9(6op1*R$vMJ%gejiw zR@yXdkCLe@g_@+W8J`vbNOqW>*MXh!PtOG z#tF_5G1z}Ram^~EF#OiU=DmW5R}vdtdpXPNPA5{zX=biq=^at4vEa+g0UqZP{IV~K zdpB)Rx{qA@*qp)UJN?Jo1y-0Z{W@fY#kXs{HKsYd%~nCY8I5);cG15KKAKnMw?f5K z!4&`b6kS8L7XDik4bde_vlQ@Dr*=;R4Q`-E=vb(?@sAsD4TGG`L)?1t(8MS3}nieh>@G_@wEYi$x(TUAxyk{Z+)c3i`Pui`NWwbHRPYiGJ&UH8x_$zv=&|^@y#WcL_oZmdykA_pdwq<2bbXKOOD*o_^h!mJP(lwic`8`8hWXTdEhS z3)EWr?boNNxHahgLc|JFSHf+G44MrV`as=eIskq=N>iV!<_QZI@7XkL#D(>!QClKC>;$$_%yf(2hMy#%_3DRrX;9Xy&_X%XMC%mvwyeT&BtjbZWEb) zH;EvA34@&e0q=yCZ8L3Zu$wZhZV_|K>VkQ!?(-<^CTBVaCy|d?QA}l_Cf%$pM$&t- z)hZnK6BhiV`N;P)S~#S;`0%XFgZ8EN$U*}X_kb@h6|th#za^aGI?jsU_^5P`)YlY!mPsH?2FAaj@m%XwczxtiCSo%-*TAR`a zA7Zd52kz~x`UELs|D6Ur8be0p7^~4$ym-E$TAUYveeAdQ8QHKG{h(zd0M&}iKlc7e z9Lu9v^K^}hC97C#v0*CEY^RgE%yZ7{@&_8+;85)4;Iy*-Ph9Ed$9K9LEVmzmOZ@XI z<+ipxin=u$?!-j8TFtw6FPLy7FwEVml?Qa9^b;q4Wn0iYd|OlpqZw-4{!vvk7Zj?SPrBmVy4+?$?X(j2 ze*7^YBNWSW*kym`Mg+-pQeW8*V^K=UO&GZL;->emTEA2kxj z;UES^LjU!hp`4M)?*SSAW;-8B2oJwa1-qdGB9)q{E8H-G>8t_$A10hY?UIM@B3s$6 zVjbnznQ;>(im7k>}+YW6`@K}Tz36C;A-pFsvZf7gPCMT@})Z;S{E z1=;G>|1N8U)s-oPWFG2hNdS+HPv>RKbZZ6cFP)KAxGrRC^p#C2CTk7Qm~dSEp3C!- zh{w6-Cjb=IVzKdJ3EUBtIH5i2O#iQ2QB3p?Gnm~!LYvMC}A5s71Sv*q?0zKWQJ?~8GV88#N6dxFp#-si=c&7z{-Mrc2IrS52x4=L8ZS`3-KHuOqw~eqf^+p4BD$W!tX#i`l-L)ys)?EfQ z^f)+k`s{SCb~kS9B^ERv_HSi*#@EjmT=O!MxJvE6()bl^eMwEDsbZxfmo;#$9s4tR z(A|Um;*3I&nz?*VNqXG&nK!kVG1mv`e~Px$e`-X$Fv*3O=ISyk1HQ^3Hh1#Q<#W7W 
zhnpxOB1i7nO^C&Xd|kbE>$fcR6|65qDTZjuin?>$EIgTOb(s$8_NMC=?)?+wpHUMz z?kUE*5#It;FQjp;cfGAW?f>iZRbhKoab8*7beECy-g@KgPyVGE|1&E=y;?@djiHML7D@{!0V+l?Zwh%ZyAv zmo>3TuY=r#P)hkjQG?bQOuihYEZkE^_(lYKm&Z5c3&(%UB>zK$;eznWf810lK9pGw z%S`xe`L=SrF6Vvgw<4or2D4xTu)LC09s^5qD#LT1O6>3o_pMzAw(YC*c9kXhg}Yy$ zTMMwwacBFwX~<(f*9KC`iMz{Li+(X}&&K~yA zC4)V~qi;Lji-K}%O?(9y_W7T_^vA~6S>oLLcK%y!?;X|DwzUt3B1J(&R5}q*z=QNE zRg|KFASz8dk=}a=O^_y45$T{5k=`K?N&x94(tGa)2oNB2emm+pw|Vb-?-<|bUl~KP zGS^yj&GkIboO^D{1EZf8=kKP=@sotGg%tdvBY%DR5nZFicyO@n-#Nv9&v5>kdH!#7 z;9n`-|AKJ;co-i1i=*UU`QP{uc~cO@QGG@d=-mMO<2#Ea9Zw2@WOPQpu{FSS0yY$% z|Mzy7E!=%aNBIlAYv1~lB9uBEbYrtZxyp5}oC2{l;|F%0fJ;a039-}_I}EN@QM-cH`146f6+j#Id^BI_3A(_S^TdD9ttp;#0_S3{0Zl z1&q9dgp=FKYHz@x)u_54`B}NL*`@Jn==f0-1bEWAWkhFwDRhp4aO*72^YZ$SCD=if zaz{!X9~KJDdUl3xYJ=UG#XeK^Hng>NTf0$ml^;LaVB&=}Zn7$iiR+s0lBxphaGE-1 z^_}}SddnBZ=XXr9fsQlH6h0dsxrMOBS33a|-s)7T)?}IVXdgB&tf|kJKq$dluZENo zPt|Nn0;ZRleb9g#g3c$N*&=uRapMRk>p!!SiWG!}8$W`ilA1YhKN0aJo z5bTMP7aToXqOQ*eObFX&zyd7JB7zk>b=r=$Y^>o$v@HW@f8c#2fTt-)qLCEdXUPe6 z+Yx7rpQxuDhmmx6)!6u9U9B#D|Y=V5Y?rgLYur)GO2Z4v!GHJJMr%Jh zX4_*stNzDykI-iWnHU0Oy{;p}0U~H*y_}M0Bu$Y!@glvjMD0;_3gs~FH<#ZbU0%;+ zM%Fp)UI1TNK0Ixj`@})8fMMnFxwMY5#evbE6fc~{V8FU=s+2L(u$J1}YKq=Ka#wHt zC!w(1hCPr2i}{z#E&dO%Q5?%C3MZUl~FsF=oEN96GNTMCS z@mb4UIV2$ni|t8MyglEu-gF=MOpo;!Z?C4Re2|sw9q4p4iOp?c6tmK-#$6ksJhh(H zU->(;06?T`ecW5LFmZvqTDE%Y!#Mm5xhKC@x$pB>N=Om7ngHN#My4XL?$pC&7yh*R zt^QQ`CIAZ2eX;ySL)G+EZc~?7Y^n7;vvItmdS?kng{&*{NM^ntR9#bs4d6vYNMl%B zCCRwa@2KbuXgXl5Mq$xN0i+n*ae9k81|a#7aPCE`1GOQ$;jB_$FToLWj_y0injSTW zz<85~6aW-nkpgOz{@i>30W;oQSlr(gcG*Z}l~-QXfsO|8$?Gh$s69gQJqvdZq&wn8 zT+?*pGv!q!;!VJct$g_|In=q`-hX?KoS*0aK)%r(B-K>s0z49Mj=qBAu z!3cBFfX3iC2Ei3K(r+t98Es&Nrofhq_3osG9}m35(Oz~Qlaea1X&M8RZHbyQIGj^Y z^q89;-(@u%TOG?t1ZK5;9YjRKbA?P|x-)pT^}=>Z;)WwoX?Miv?YWyINas2N@xdmZ zy^5tK7pYk72J;!c6s#UjWqJIQ>)!?n7pPt0kXBScp|Rdb7}4SU1AV=<<0+#f-Tuiv z=^7JoxG7M?%>|#!84Sc*G&~Ksmfx~&O-M!%<^&es6;x~D5(J#hzHmpj^yDzod!24w z&|stmH0}x4%~_ss8TRViLaF3Uu?JR&NYU2P3VD>#3zcL0^jRMNa#p~C^M)95IH_hLIWgRuVClN 
zE5Cu@r&H|x*)7WJ9T&z0-m$_DCzyQx@}GoK3x}9vvpuQA&iyr z9~`C!j<>bIqp|=Fo{|M}-lN<59!PqFE|~@0`N(wt7Np{Y)wpiXB1Nz2ta!8=R+Y`G z)^*2x`3|J56_|Y3@*C4Yeza~8aAtdWHmC~;E3&b?UpyJx&x#i>0`==v-96AuOIuj> zQTG{wB(V@12YyuRdgf>=_iN_PS5=$TehU(0{jI*6e|F}{dGgxvBG9|+0#UeL7?Z~w z$uv5cN{7`7m5g=bDc%h5$8+hq3Wx;Anw`TgIceN1yal#Wdnxo}0Uu4D?vb1eSfxfI zKoFLS=~fo}9inOsQYy>xNs9C6u3B%ud&6MmPiy|>2}CwHPq5LwM4H7;?H~5`exueE z0)m*bEYEK=o2}nCKJ%?r((<*_X?4F7LF_+f;?41cE1CK79U3n)8EQhFVCd)ZG{ov7 z1H*0a7ROykmoSuq$TGNZ-|4O0iGA?w7gwADvHGkDbUQLtUpM-~oAa=)&uspHi=@_F?kK`BNI>8IkbGO?OjO zO1g}V<3#IudFdukJ%_5RlXKuby+g%gG+U}H6sPn}f#9>JS(Yj1#7Iahxr1LnL#v{n zxR6K#_*tj66&moC-*$$Ttf)yvqO>}Eyuc^IBN~`px8u@}{pDwL@%RKl`--uu*Lc@l z;#4D_nqm@8jJ$r_krWyaB9RFcFb?XB&R_ zWps)e_3U+Wd&P3>+stpE`!s4aDuS{#c=A;kZU;S@CxX8F;do1(3~7{dkfiCbuZFKj zJulL8pK$dJ{7IduWtZ}!kQoB3LkrLmyNrInnpDRP`wiM{PVktVXq zL{z%>Ci#7Ei%{eI$|YKnvwIsNM4B+o3mD}p1#f|pIzU&IKUmYfqRo8j8m*Eh3DDJ> zw!XVder+VNx!6a>B>u!ATDHEQEH^y5Vs7f-*$?k-gt$;%9NrS_zuoKg@|GYX%C({8 zBZi2^kQnS}(QO};>hvMk_8=VZS^Wwccop+0 z!~?KRR-LmG4kw@j1?wU1(PFkeU+DdmYABV3(Xuof{%^GxVBVa(eDQHa8=a}1>pJJ{ zV-4QFhSA9EFqCXFaz^r*Pl@yaHG#0AUe?w(rhA8NbOt0MA{(9b1~r?TmRSn#pr0$F zC`Ie(o&#MSz->48OnmuNn9fr2Y#jcqSqzJ0CfjcUiN>ci^Gtnb)nDVu z&^EL!Fyu*mC^ypd34s=NfByVy`;u?m7QM?F?a5YXH;-|wvQurm&QdR`FG}|<5wAty zHJ)i?yN)-YK#gO40H=2#P;GA{7>Bn;z3$r}P!lvK6s19}0l#|~1u#~ODc^FZ+CDiMy zVnlBxZy5D0@~S3f(R8|&r=x;hX2JGKFxfjiCEpN`1=gfoS=R7aPO9XG#Bc^zpFfjk zC9(-oLlw-8!n0D{E9=Ip1ax1+UQX?;gmRN>IlX#IwVfitR!D2Vi9wiY200edIB!Nb zP+~4`>xi!|b~_L+92-JXp$#kSg?7)#mThNF>9{X9vD~H!^QNCC^ACgYU5kglECgWB zpW@9{&AU{s3++$p+@}u0Ro|!v%sz?~3%pG(LT7s0u3?3Z{0W%=-P@fWM+TeJ+3z1- zJ4)8m-9(y9S;RU;@TRr|I_7USm4-P%CZaXo*M2K9xgp3<1vxjk37IscWv-7$n*92) zpNSCO=Nzef_q&`Vsyqry5||y~o569=sRP|wg-JjAW=|7Y-FvmID_25A58MsHcmtfN zTcnJKZ81s#==vu~tuDMQIzIMid*c%qoUa)t^m=x{rdt_`?IHx0FQAmfINU@WZX@ON z*a1Jr=}eCS#n+}6UFXRRq06%k#o_%=IS9zo{St=$?voOS`P1ta^oIBNiz0a*{X=rIa%Ijj z;z6vQ!>65%M5J<7vWe9pa=MXlY8PR*Lfx`ok){IIWM~^G5kBBt?tcn=w$Aj7?*KM67 
z_Q$4H_vqgLzkiJA-~AX-EoR%=h6SyWDmhz$9rX%zYF@k+I;Umm*)6nu+h!x~s(cbz zQMGLAwYOwN6_o!e4{vdq?3$0>R~J^Yo%ztq|Cpj3vqk3jZCT#dnDhoJ4s>MpXUfIX zVZfs`%`B5Y%K(Cb9SZ+HcIZKed?@vJI4?8pNGtWtzfbx*7j?oJdIW~8R&m+vMjD8$ z6?#(G)m{&}Azvw$HvHvpy`2=Wa@r@t7PJJI`G|5H^h`RZ_LC=sZ7G2(=RFgVvBc^wMh%?y*zK+_C zM}K@h>**Lq!8X26pnB}{OL*De=#OH)pL5V94CY*F>OX~BJw5wzc7gJNlCQ37!JVzh zcGJ4rmP+)#&G>gq+e^){2~>X(manetL1c_Co}srG#ic%2O$$8ztE2EoDA z`fY(@&>jYT@-_1&SZvSyF&!0CN+`vLZDxJ%XbN2J->KH-ht7>lIRFs%w5eTB;(iTh zJ0wMQd#Sh`45tt{iX7NZ(zovL4D*olAC(M?=b~`m{I;KrFfZ$ z22Dk;ob~lNB+I4#R#U+gbZIxiBeFWrtaiESIocVExg?}RTsoELso=5HPMmRekmZ`@ zwQ{0?RzixJpY5pByWJTJ+JNs$u+v8rR4?rB02$TrMq>D1j;(*T`7oCK?5Gl(?zQ`8 zeGKHoHqe|656CR=VWB6FD}V|D<8Y@0^DM>XILnlsroyv?4d>0?RUj7}Wdl2?;AW~t z33l00FEsB>Azn$FHFSiMJrd0OfvjGy&`=~G z8daw97cG=LH`XHeYdfWaGCK3iUX)MG)85$#yO6s(b-D94I)%D1 z)$@dSwaBo^QjBr$&d!Kaa=5T#{qPIdbSKQZOGv>m7tT6dmDKNP`b*tQ_F9bww2~%D zgLbnwU%!>J6>XvR-VIBTI8v^nKK3s>n+v;6=K}UQigYTl$5at99<4CmC;;C?{SZ%3 zywuj?x_d6OpYU7&Q9Tof5#N zt?G|TrD1_fTCuv|NyRTXB{~_lcyTqDk~Q5c{I8G*nAZzPDRbN2Kn;k4JuhN}&F8@| zyWY_NZIKkZO`DG9DP9fDr&l&7^o4fwNyK+)Vg*R}x;D{+9;`UM3`N|Z+nUeL~$L|kfzNyMA6V{82dMLz#0hS_W95hXbhr)VZ zYsRtfp>w^1P*c4mW$7lRo=ujEKK9Kmh&gwImvJ8Gy5%XSqwMh^P17bPEJ1|bI8gml z4Glb&61z|c+m!()S$xMNQ9`Ta)z|N$;Scl^GYfDqcEG;$*LBSLxjxl=)XXpzS<>R-3}D0$L9O zWC0#rF-^(-9BX*{5W=@|P1~X>;mG9P9*}GSDqzVK^-3JOB)OAK##Sd+?)D1iv-cPZ z)lQD(2qLH3NGR5z1B&E}#PwsRi_A#toNIw8EZG~GJ~?G<3s24~{=?$lhK z#?3L8;oAON3gHgRe;n3(Q8(8n{wRyk0(wj4l_IseP085g*-z$#y$QR(_vhadEyu_1 zEeHVJsF-1ZprK1}RYOi*if`%=BIw2&{Fk`d2x{6D&cP~+b<%2oZ2)1C$%A{&bwD-I zTM?TVWRiib&4y&7?G(?z+f$t%sm6bs^W#%==MMhFi@=pOeBaxHqhout_?7I+Nt{=< zq9?pPh7^?VbX1)y5H>uqur;(jPtN=LlL(5zv1-cGR(5vd70S?!;X=?igJlsS8u^6j zAjJ&Z2R3ERE;f3JwYC$F*ADEYnaKe^n(v(QrNZnVpS@r9qd8At)ye3RYXeE@g%z!q zkDms?Pr8B(oTA-bw9z%DLNz=dGY6H)Z^fNfzO9@^ZC_pD>u{7IYsTP2k>{1jsyVjU zNa57V5pRvk)Cd~+x2q(EgIz_%W9vB!{$|p+x*OyT1&o5O5@8b5HYcY zz2B-En@2dnGM1VDos@HESehXTpHIdY#Xup1om`U7C%H)p z9$Vj82|x# zAoxjfY!cKOrgUkFPM$ZcPT+%W`CPN)%qTaCx4$dJ<#?7pUO(kIN0Tat&U5-*Ptl1j#r{U 
z-O!{PCc$ytXCVZaXJXVP&>|oM9*U;^B7%rZeXUwFuSZG0g5NHFuk90m`B}@vv2*xHbRb{=iYo%_ zu5znbxluN}a_vu;`1`eLbQ`lPwNBf7vg0iveS@rNJ?!9K%@?zpWUm8qp(CwxZ=3nz zq%Q}~=X}8W%^g8z-vsCZ#fAj}dB?(>s>O#a!zRx1OTGtQVJ%6}#qL4t4+*B;3DYJ* zA{BRBfb=FHftC({F?zT{4_xttqF@eGJ3r!%zCx0d%j1zVTeJjXk)xnYx8r(9Qu+;9 zWj1I?HDO|sKj{+tvcWsNSK$wcUWJpg(UV;^h`xQ-myk`DlAHztLFDm~j9Tow#5!iq&%BU7pGf_*x12JA+Zzj1qIm&QqHJ>?3@il@mneaFFS!GSAIM^`MnFm zLTiu4rH!bulvVOvap1==iN`)0y9=`%yXwjDZkyxTHD=>VzMD_zm|#s_??>-mc$-3A zNBPQS@AO*n6db(6bgE2d`XB| zRnPAg?jjhAvVMM!&|UjhN9ol|&ZJ@r*(Z(MLr)%Ge$QO4KhjK>^U#k!WADPRkpl+2 z4XG`^@KciEDsy@Rt(D=bWjgqE9V)e@yK{5u>O@UZF(8B~S za=Q0rpS1W55WZ5@mi~PeTplX_wFQf~x&rGq;T(Lwn*1w^qvwOG36s-*PL7SVBWjFi z&4_&Ph79e~qX6B!PqL|(M(+KzFP?-5S2SgIze4rur3LFu4x~Czw!FfS`D2ZQ9zF^D zr%`SmzD8X8IZTHv?~h7*`jYS*-|M!Va_$xQY1}f)aqQmGmhruva#)vo9mk|rP0W`) z1@AqRs4U7YxbAVp8%aHN+_D3fwe@4saUVvarv*#RJ0oc6Pwd8?HEhX$wGG_ThU~d% z;jB%#!NIC>EX?Fmx1Z21Lnl2Nb;36~N-5pZt)ek9Z+sQlqRU?Q3_>Ng1UQbRHzD)F zl9CjG!88VTZe z&1H|(Ct_VkxJ32T?ep`0%^-?@nY-v)V=siKUN^d8=L=s0`kK%m-c7dM&LSkZ0WB># zdf5pp+WN1GOec>7Pgk;Pc6{C<=#EBvNwoC`UzDERSrECkz9qqk*)4rn{n97)^5kwC zrrq9Kynkucb`Cm_kgY>Vpz+W`wv2iF%XyORil^^QIf2@LrwDfz5Mtj^QZR^Vb}8q=b-<|UkD9y^NL*rw{jBG4IR6atqkX2 zD8=46MQ&ZmST$}UV=I-$kZ>mWJ$?bVQ{(B#YcF0FFH8_b9KU$t zLfb4>hL;P`%>?cHacn{-U$hLNSCV< z%2Teyzv*Va)Y?0SvZYzC8P-V8wXGE)ox9`OD!F&zA6Nc(@yvZLMnnx5apXj$_A*!P z+R96z(3sAHYnr^(Yc1mM1l6nLfr|Q_%?oEKHSAtZ-4#j|+1vG!MhgNs4(nh0|4|xu23wUf(cV0!Kp)3!ZV!4V3N*>P0 z)mILAoPF4rV9OR&mO7N_X_nq7GxY{P00p8Y){|`6>jsFeg+Wd16ktAmbx|*D!6*LgpI69)7eXnfK}62PClX zRsyXf(Mv12E+PX_q6)!e*H!01&macqqq2q;-F8a`Do)PU6fx#8>uG{?=Q^1E+?NMU zeC8L7B4MA0o{*P(bO*nG%MJ`f7ZhH5tp3#sR55X$lbzsPD!S_ew<6JGP0Qyp3L-p( z)Z_4%!m+-If<(8*{6<*UjMfIU^SmTwXx+E*XmYjLy)b?+jmHb6ObK|2Y@utec0yo+ z!Gk6Jk76Vf4&%x18W!%v>LijGoR}s#_r{F*#ZeBA(fzn@wigI^^-3S9@XiywvjPcw zti9>A&q~(q7C8k{nkV#y@tQi2_1&~W20ORQGv5>r>o7(0D*2;+qPbUj@rzQ=Hh9`k z;*NdnCFp&NOO$-p*xN*1k0$rBmEFE!w|&-*bkz%kE+%r`L4U}eY|rTQ67YVTabQ4M zZ-wgn<3Fx$mK7)vbN!pxRu{$+pVx$(n2aVW3A*80`1tiat%TcyxumvEE9O#rvi*^q 
zC24&;&HNv0x?QRH%*gsFZef8DT4r9oA|bdubp#??MlLf-pHAc?t4N_5EyQPZ)Ri?E zY55#pIxROL)qW0OGG##S=ap5(bmZa3zx|#Z5%<6lRM9Lf&SowJf+$j*!HJoi?+x}) z)pfD=3_{oPHxokI_xQ ztrf4Rys-mGD0SFp8%?{D`GHo=aXJdW=mjni(|z@6(TCJ_qSm~$=d_~Yw`F^C?iFDB*D`IaMN6w0`7saXk%@7JWZ976IOb*lfk_H(hJCQZu;P&cPp ztoIXg1S~e%am2`Cea+v(1gW8KIcc`?&9L3{5VuIVZ}MfQsYX~-MnPl;Tq6?XL&OGU z?ozgZ=UhAmdx90}TZJtvu*PV|#`JU-*YJDeN$ zQO4(0pYx=nGFmWJqmb_W@Zg9sLe$>#NvwKE_0OwoxMVdm%O<>s$;WMHiD!efZ(Go- z<7%%^Yy|Gc{dK7g0dGUviZ|zc5fc{?mc3L3l!r9mQ=}g(uI|HG}_`EDi#k24SP3dS>F+V*|$!0 z2g2>#XD6aXGFcL!4_xA~mGRc{B~3>peorcPghtzuf)J}{0F6NY0gCoARU|Y#J^`N& zl@`X5BQE8}kuhi`vw@mSYR-DfE7g&PECCpj9ygzXhoYDE5-XS`hCoWZDdwB7_fS+~Ka{7Y!@vups?8TS{6+b{ud5s?tv zj+*o@lVJ2&nCeZ)Exg3Qllu$sarFhEC9dVy%&o{OfGbSPMq5y_m@oz=$&+Y$f5%YS zKM`FPaAUH6i-+`3q!HtMmjV2NJ8+*RwhCZqy-~Eni%Zs1RBi$Wh`kK^ zdc2BV6_}1yNM_nIHYKG#WzqR#vx6A*$0<3cxSQv*R=N;E+Q{n?N-B2DbyBlQhvzI6 zgU?&Ry@j|EaRfz>1jqL-hkUkzj2U9RsE+Kp$3sRv+Ng$TeEXP=sTMRJLM&Mbw6$?} zZT<`8otv+?w%=3x0*>t&SWg^=ctc9eBr3SLm3q^Q36I)bB0Oq6Xbu0mX8K90M^E!Z zy7BV(kDDh5eeem{@^28S2T8IO~L}5G`v0UUBso zl$Sef$qtf*$aBT#`5i!@8`WxMz0|{GJzST6L3~IG_fbP~@5T|+6$aCy$B0F+Lie?> zLM9KZIR+SMyc!pgyXXyutK%G=dMVy*Q;vPKU-=+yAmj&Cc^m*IGUQFU34m*|oKXjp zNGdbKWjNT2*&PY;<>Dl3A?-1vABm$K&94~086Upu?tY$z;5@@w*|J`)6m$QNAs|^( zM(Zd51XB@+t*x4F)RL6MSp)C9Adbd%)%l#Pg}#$v*M2nW-F#l8RcrC3P}lNet3_4h zYvhvsL>U2_T1n)0?py#EzF(mNqHu`s8{&?Ud8>w{v)9*yusgPSbCJ6CF&WeIctT42 zvrU`E)H?PMA)eevJC4UZc4ssvDHKB&rIr?MaFi}GAG2U}$L1$;<#dm8sB>-eV8!CE z)`Ly;+}w9{PI(UMchRF4GADI3p z15;!3du174Hugv7&0MY;wxSo1@$WgiHi~pjc^d*xara#O4^j-}^n1C{4?Ssy`<1c$uN?QqHAeeQo8IkC zE-J~b8E3WPX6yReipPD@C!19(OP0uj?>!HxqUX;eEaZOjL3G|uVWfVrvZ%+j+1Jgg zbcHNE#*AUA|CO8?U!ri0vz>a|I~U6Jeu7}Bd?f9HJnlhgrk<^KxPV2U@Wgd5&}-nP z`$gZiReVn7`+IxFBNeS0Eixac#{79P{I{Hz%ireUw5q+ILOSlitlJ*H5J>dhP?0Nh zulc*3VmVp&H%+%s%%jRL>5DU4%6;a)GZp;5kpCZkXNKt>3UwK?DV-4=^IGOgTa@Ry z>b2J%DnrKtq8B$=O~?;5YuQ2hJ9b|GC2fQ{wcH|Rf3;~XlA`Yzp6an5I5Y7-@8;O5 z2P)?1U6AdB&_UXD^2ezTa(x{8mi?FTXjat;$@V7rzigjC??+WWP`q7dXXiyZF27zS 
zm$N`wd*d{9zim;+F}pTrWYO`yONzj*`}m!u`?1d=2K2}QIRmM@bLTsDNvZtfG)m() zZ1hosU-4<~wg%96VC70n%VFNO#*X7}z)|LmCAagFGN&<)mq6zfo#9 z&e>KAf)dTCfW`~K-d$P1pC0$K1)nPXMxB@+6&y;{v4C!r&(s1v5qhWZv}C@bJ`H~; zKNae&^t&&_GoT1!$4(22ZSo&q18h0GY1@1VS3IFV^s+8lvHi^i#psx-w|oj(?=VlH z3>k1=5s+|VMUB_&@J9UhHw4@EaFe7wsJzHb^P^ahd{kcTZd7GVnuIo#!l`hEQJBGLt-g z!cBi7^0od7;o+iBwK=gX56lPvLu9j<&xV)@If?&S7y7%gn13_xNB&m8`)Sgs-uj>} z{cb;HDyV@K4{57+CTT$6l4t)QSoK{W2fC`Q$SzEaz5b1&W*)T;G`MhL*|uf8_nU3{ z|7Qu;!%t6pRs12fl(?w%j~2dQdt9|X-hTHa*x&U+-?r0oMu@k8e)`o|ec|ejZ2><_ z=f@7A6>^y=kLsnqt=_cI+i{+!eTrOkkE=J5W|BNuQuBTPY442tE{R(9v79W_<)+E_6R`~+ z^;4vV+SJb0m~GC;^g2dPUP;ovffGVA|6%RRZ+oCcC~o>6E~my_;i?PKaa?n&zn=7w%y@a$xg7B#Z<>E%9C3IT00Q|N7OkF^|o(;r#X2JUqHd@a$k>u>_dG(^xcE3>%1sq;AlV@to#3fzr~DSUR~Qcpn?RsT z!4G9+p5J{RNV}8w$)D|GFfz2ps*8s^f!A$W#kS(S(lbBk2#V_c*g)V|0!M2SKq|_^ zKJ{lWOERsG+ldDy+N0QMF8k^_ZEUr>OW5-y_MlZ4tz~Y*e=RQ8Wjq^f8OkzZ>xQh9 ze-_Ia2J^M50{_Qo_sS9tX2w+7U$-utth>S`;*q_g53awI!F#N6+8Ysj;Wb#?P$h?pPd~fdj6LSHTX(W9nSIgAE$(!#wYLeY7f&{k zrBcQ6;Qo2UJ$p%|#6MWtv@B-?4aC64152^a z_OB8AZ!3g*i1F{$^1p4wKc^7>whaFqDV<1&n-iXVj1gG&nqpY7=oDia0u*qQp7_Fd z6?RJ_YbEk|{p$@3A6!@6rz3asHMIxXkbDfZmfFywMiHZYTMF-pyi2It&?h9!OS~=3 zVju73@7N|*71bmgbtm63y}P)7IUZ8lImdmH*C5<*SBt%+x-aw8nmBd{qHLr~tq0Ge zVIP1DYQoj@1YEAGxajm2|LbM&jalL<>O@mz!7RGVJym3~__Tlb{jPl8S?I1uUcevA zVGgU#`kC8xw0j}DA(C-;>^sStsa0h*A}tA*QMSb?5#59A>+d%GE@1X~d z5}obEr%b5jVw*S48hK~V**sg$2~-CBL;50~JkzL{EYde*Ae0@^Q0; zL+m-NF3d46?Z4yRy-6r~Uui4}k;Z+ZCqDLC?`aB1iKG`Drv?&^RjkA-xQdF&EVRtp$HX)(&QRqo2echTI;94QLcUn?n@` zn)Yqu^TKWp`Ns^q7zEBz#zH1$O`22gd&Z1jvE;*tt$Oi-0+@y9t zy{7JHZWpNU&(k_q&wj^{M}wZw9tx6r4)-d9pO&RkWOhj540?Z5RHrhbFZ z+i|3qcckk9M-;>h9*^6{44+55zT|kj^gJT2Je}akIuG1Z-H?e#gLkQw&KXb>p;IuXeq>q5|;%w0cYo$CB(-{rfp zL`GJ-h+@2mlco+!3V+{JH9beKDs@j0{9As*XLi|5yE9L0U9w9{4+nM$4MgXr;X5xF z7*G2C7>L9UAx1{J%gPICe0|$dPnTvexs84mC)D)@BP#-TX+LxY&?H3RvSK?nTjA5!g zuPyU>`#RXjT)DR+PJ&qZm`7vgPl2e@a;F$?Jygi!i-&Yg@Nq1Uoaatf?d)0O`_@{u ze?4P7z&Pz;f$5Uf^FVP1x#8WL8wO!=!I@(|BoPjvZsql2gBFB*)zH@j8@%&KhqICcY 
zySuMZ;3E8v*o^n1Z?~=o56J%_Ma3z$!!n6ZhAZNmW5_e$l*eJ1%}TsjJ1*{#B0TNz z*3xDeJQIRq*bXHkxMrT}JKbeVMg&Psf>}YO#w;$f`-mIB5_6I~DDnOP*{1~4@I7ql zhzn}xt7t*bdiXJYKS)sQ#Bx5OSL++fu0i@EFmF3hfQbWJj2cIGDSnt7ILN&Kx&u!tVwx0GL5NxDLFRB3SHbs3VPRphnrMl ziFn*E`SMxr0ha+m&?@uCh0@q=hSHif4~B1ffbvgFdR#Ki7=#r1Gyzyj7?d%NQuMBH zzGtg}&~wOk?C>6X8SAZoWEHRDAR-xVmF-zXqhmpoFvr=7@P ztz%|4Ax4nGyMdTWiGk_u!_L0NzGJ1l-ZO#O)h#>~;g4lS>CLv`NopYZlTpgICmTv) zPl=|}bGNHGKoAAuVUa|zy;te@6=og~0h>l^)s8vBN^T(@2x8nPHAbCc%=Xz8Nvpegq3V91a1eXw-A~w)~ z&!NN%94_K%+HT-AF;hv5A!P^+2*>hViemy!b|zdAVYw6t4pw*!QQSi{oD23dgGw7- z2Y{?9yP$qdet6^hEE_^b&9@v@ZO9XQT*QjBFG;R}n0aJWg+UmZJ_pGxwi!o~8ec{R z{q79oRR?o#zZ-{8d^Uwy zdQ(ui+W@G!+M;KjnGLt$A}}F5pD(zpt1u48xi0!_d@@@6dfz}cs_A2j^7}!K@zq}Y zL$%K~v0z_5s)z21)ng#5-44r&pqI>Mh|w9@xO3uV!gt~sS=}(&mtN5-;RhFVK;8N> zvF{J8>DO|z9GB~gwz!=Q6xI4PKe3W@Zi@qU6pKuWb5dx7U0uv2>*tWF8$2i1Vfy_MYmaUF=20)tWc zo_fnJ(Oh!lTK6E@9aR%VAd60z-ryH;=_JH55(A0vWJV=?n{h{rMxg>zVtg4OLX~c& z9J*BPl#UoS5YkqX%$Q2J!kkP4Dc-E0?+Eu~$}k!i>OO=Jw}b%!URs69ydBxn%?o4t|__ckf$yFe{A4QJvRl8ZCc)0E_ z4io%_NfwYH(`G$Bc|`a^l!ZZ27(`&gGwmRm+Dvvsywzd>ZoA13(iDzxm+dkEA+ zNbZvM`5FXy9D>5ESwv|VI5)LKdpA0u1$fKzMXyS(Q??rJNbGvV?!84x;loT+mAj`XgJAdMirezJw)-j&-~-^_d@7X9ols6}>S<;_Tfg0G(si>SdiC%rG847vC% z4loJR=0k6_aA@jg3CTh7StsBY>-^EljJbegWf2xln;H?95YFcUU49D^xC@~4X zYbG)X6N0gMN>fNh6gQX0V1Gnazfn?zuX5ZgF_~^K44T7RxZgPhG+fvb((L!{?!Nh#uOn@4_-*s^C2q=tJ;d|=j zn_$OgEmPFjJ&B|_B*a?nPUJ^Ig-3RQ=_a44hy_2ri-?A`WkR>?#Rg)Lo4G!ArHaz2 zsK95&L%qZXzNJgm+l&XHo|n$mk|Sm&I{0jP>Ox%$JWc($Hdg21{E8h`H+#Mj$BkE) zr4WXUMDL{m$CNKiKpc-I%yG4V2%i#o76EF<=4BkKtTSslWpA#pB8>t9$|<>zVE1m` zJHYV`=IvwmiUo8JN3G(qaEiDa>FCTnD={a)^F2W=kFK0z0rxPVCX!+Df)`SGzvlya zNlOuv{4r`0?9=2^yVf`P$EYr2R#lm&=Xt59>`XFMLR5&ndK8UiwL z@jt&kevyy!_u`c{|0AZWZz77UltUz=<+5{L5upt! 
zSCEu8PvZNEEv@?Ha;Lf?L3BdhoQ}MzU*Ms0c-klurJ$VYS`6K4yCJLJ84^< z)^xoeOoVZxJ#zEDUaRW)j+#OANs(q0&Zo@m)c3?(vEMA0tYV~sJtm-vVsavGB%z5# zgfBQcKB@A3CB)4l=0X!W7j_2YY$*tZw(JW=jvg}7Fyrx*%7<1Yml-IhA&6-(T0(kI zrEZOiW|?Z&OV!Lk5p_P75qd6DrMih$q0C-b55=Is^;{LBFbmI1VCpP4QU~4MxE30j za^0$8-Qh^Eb-T<|1zK+9aRxHokzMeRL>~4d%F1YTE~C&Bjz$9^D;kl~@y9cR%oJ-# z*RdhT$2(%)cOi-NZA@=e5->``Q+tUN1l0-MkaOv{GqvgEn=Ta6$f(hvUnJkKy$PS#; zIjdEpyBJkba6Rrj-A*qbm^hm&CS|f#Z?~dOP;=4YV20AI{Fp!xq?F4KsiHXH>X|~< z(Le2NHUpZjMYboNgIxBN-b&@qI$nVcKF%E?*$dvh6uSVPT8W43T<8cr3%N|Q8PPdT(-z~5TTXIWAwqg#5X40jNmJ;V(oX+)oZi(r%FrXpF-4Rvr zGtH5J1YjVG?1ZIB?CVrD&)2&V(1|qoS7!P?L?lY5&w`P1HBiJ>hJ>Qbj55$#^r*^t zQ?62or!1OG^PMOpvh9HC`$e~DfZ~~m{)uP5Mqc^Ik8X6Q3^!5-3704EAyXv%`8RSX|JuEYwJ}g!TUXW?2 zLw_7-{pJYI_r1zsKokqjS|+G*vl2J3Dik?^qJ9ZOnruGo7=iAPao?EovwGaE6l`J) zw>4{k>u%Aq#-79uC|sFZcHpU|CO43oH9`EUD6OLfF(E% za@q7IIaXQhS|N~9I;~R75$u4Oc}y7;beqKqG|ZJp!%opG$~IN&M_RKKsk%q2CTBqu z1#StAVrDLb>iUEmyXoxXNlmsUoF!vTPlbk=zUq%FO$F*|h98Jg3&}%NZh&N))rJGF%!rxCb7xkkkV$mB`y#+l>!3A$2p|Pl4UlB`B+CLILGgZ{Lrc7DX!y= z%;uEgOqhmoz1Om`44Nt`x=%`4Hi4VjkeuK&(wX5ttAxP((`3F)xs6&#>#D|dol9l-Ix01 zX-i~q*Tsv9AC>Xmv?O$q?34#Ru{NZIQa_%e%#;v1J!xQ-hc1$L_;8F1iabEIz%N_q za8eSTS`OIv^jaoK)dzQxuptxt8ewHjM`uH!=-{YfdRDQ7GkB^o3e8anEMQUZIF_$Y zd-buMt}>N8nSo0k-5=p&rbTO@08}Shbptnm?FT>qZT|lk7Wl8F2iHHv4*v!ROkrOx z!KB&|jPGV*B|nPB)W5Mn^uMt{yC}D75?jGwtvxn?QF3}iXu!KRXZ2boH&FBG0PomHl06R!4B)eWY`*I;eb?Y0UEz+Ln#CJ}wsaWrBQ7qenFwn@{9DwWQVG-k3a z$v_<1>cC+t6_v}trpH2sNv%;Gha=FYRuh^W+~Yf*n_+rpHn{>fkl2NAQ*K>AQ(QO?K`b4CF2`K}tA1f$1G+TpQC+-() zzFf9VFL>x5%QO$1=S(Tc%{`6AIr)k#xzolKM%v2y8VF{KVMbt-j*36uaA&{}PpNv}O`wlX!!hBC#Ln$)QR zJPSpO<8&CK@Bqe9rO!Y$w(X5Gb)A4}pb6dgOl3$rjkaVeo;m7yQydRw{a!_(_-v!D zmOZgkZE!tmrZ%AF80BXk6o8=6bS7(58JdLpe!-iyAk5@bb>P7>My+;KHdP9w8%Je3QaL@Z z+faL2mrY~{j{`d__K=cSfK*^k`dzc%GqU3$ZD7qZEE65sD4^ts=rA%d5QMr&TJ>?= zsf(kj*`X?Wo$XU_w&Fnb2GMU%dowu5M`F1eguYd6cYKz{g+{^(L9;3Mm3lAWr})G) znOSCJo6rPC#oY+5Cxw_Osbqg(q{kCvLxyb*GkTLjvxYTjcBN*E zw%npxt#m{oS9h=+2^We9HYu}S&?St1thS5eOz3d4#H!n6!yMH+OhM9ZbA}KzLBYGd 
z#EgSz*6cS%P7xUEl0dclxB!l+(C-Ufuw~$Eg;F8O%oHsOD{Qu1=mo4sX!!{bC0%J4 zgnER9K|PV0k_%~-reiIvk~oA2o*CRAck)7yod zBd8*(MLqz%a5I35E@yTMjW+muCd}8|an+ciHuZ(&^=U83D?oeG!~jTYmmC^_U$QH0 zoi__$P*o`a0FmlQQX<0vIMh0n;FK*ENrB)@i^(Z4S5rlOq)`!<)i`yWuMKGg8NjAm zsAtjYNCmA4kh>d}q6$L<;$^Z<6ozFvo5=*M zWTZB#Jeah&cH9nJ*Fz{G=lAQid@JD%q1NcOG*&gLEzhXeZPXC5e527N%afjm;MgRi z=$$d@1WJDlJPB~Ab667#&`Ioc%jvv2^#L#k7*JJ&m#@=RUK|wgR*7#l$}^+GDxDdU ztpV*$^oxnes;QN7$y}%B06>J7p-Q>qciLWC=wXc8r^=aJaS|FOPBA4C1oRAR7cj?)qM@w8K(#a_nHzJw%O4&}W|j{t87P@{vJhI;IY#F4a9r!OwuJa;FCkNKWP;-NC<0*fek&xPrBIDz$w)B5P<#YTtwC!HGMM2OiCmGO95Dh{~JC5 zcOCu;KB2(dm0 z(VCz1`%Eh~q7qI}7(C>wUa1)kbHzHFi|IktB{Jhmo~?8nrt6?}e~h=&LjeComiD8Z z!Igw6f(2nUVx?4}^_#pg)f^WYO^##+h`JGj*w7 z(o3pM_Xg=T7e_G5mipxq+whfUoS*fXWI!oSrA?OO21W<coZ3QdxIvJE5ItMTWB|{fFc{iHOnly-I5x6eSzr|;0Ew#rgfDffv-q~ zR!hT*d@3d$Tx7bzpi}eo1|MP;JrZ36McDCF@73d^K5nHJrPV@gkvGRtwKObb$pTQq z^h7I{ji4{Jq@*JYs2~x=YNE(y0RJNxh8d373si7g0ZNf&)1#aYkZ15vL$uV8Ko`bqkH3Bs| zn3qaR`wJJ@M@`$vBf%)!8Tj6$Obz2P))K=kpf*$UR0|zR4muM*XcT}04rR+QQ$icl zR!)f$ zkp>;yaWgsIk28%{kDEBkv_f@Qfl2zHCDP#$Rf0MbsYT#Wl~6+oc)t-d=tWj9CTjsH zgGJIx80D+-v{wr|#er9GM_JJ~Grhi_ZRrJdz#?P6U4yv+1nL?wSDh9|R6;fRX-%>L z2Q+fvVmqrke%_O$SVXu+-*=!&!M7Dos->60X-R-uS>E;xX zEECIoyEIX!Mp8rvv(&26U>&((ddEwhQ3r*jNO9qDwcRE$mhDD~Gn*vZ1k1H&CV{~v z7lQRh5hDb(Q5%O!tz1gRopKIr0cVmB+Q9+_cH0Guae#HBXL!__s^AFdOl1NF^Nbu- zXiJjOuR4Y}OE(23G&W2fxzI^0XrO|{6Nr);md8Q zU5bL)P~p66H((oeuLkyr7>sKomhq?FWP)X|`mo$eP4UD?V62kw$tAqeLCVvVUjgxg zFhcN(*Z|GygQS*XO;#_tMNLqp$gr_&WnuwD##P;{33xNwW$01VlBV3)1iafUG^;o* zl6cvj=(9<`K=qn(A+9iEWD=`EOHV}@p_<;rsVCMf16AxQJir669mH&qnjp+l#~m;# zG2%y4T7avvLBm-X(;6&0=KF>>fLeJ8PkLbnOwEA=K5?vkTZxr2j-*>ZLmE?gR6#3I zMjbV0ZB%7RRj=Bi?&J`Ens4*b;k&h`XQFn z`qIE70;uGxlG;hZ&7>?_GLJ}sTl1}{6~yH*H=UKc)v+4opqVht;zKmG@EL0A%CIVs zL_PGg7{jto9u%iA#inb3W+5EZk9s56j&NHXO6?$O4)Wb8JTuyARvb%8SuCX;RX9iR zf#x&IInNT(MW>Sn{}hMy!B-v`j4Clb`vUs>U(pXZf1vFD!_IV|BPK~pkBlZ*NVjYf zg%qr%kseHqx;-Sw{X0^lqp8&83b~2Owh=DpXc^cSC!%NJ8d<5-2&+2H)$Ed&N_+`O 
zieOQhqC@zjQoGnR4G^HCZ5OSR0~b{)b}u^uQKBi;2*#v1X!1Vr=WW{(fPu4Z;3UV? zaM*Gw04-&$+w0WxikzOpY$Aw3TMFeo*{Bs}%^sj{hoanxS=nS{J}@V`DR#RmNsiPB zidO4#vr6S_Gr2w?yG^+gm%R!c0tiT$QxzX*kt(rMe%O)u8iyJYjVp9G9*FhgxR6uy z=77$GwOQS5H7F3X0XsuXMZaf^!knhTD30q!9?hjEOF{u3LfSAp;INRz%Z2W+Hi2li zo$r#i|Hag&cm;>@g<8OpHfJ;{(M&KZH5V8{S}~d8Fe9o328+UI5cJ0~NeUboE?=lN z8a)t$@``LlK;m9+N+hh(i=AdJA11UHr;wC&?XV9UE+MK8;vP-t9?0~74DK~HV8MG`YQB~n*9Rz?rBE-A>^k6Dt$CVu1?4wx?5V++)B4<$x z?2;(Bq$>cGk)SZYubXC#D#js+QQ3^1RLQA{2uJ%k69bu1^=T1;j|SrwkAOavG@AiIw201_@GM&-Ac|pS3HY7F&}CzAd|zDHOgqkWTsPmbXD-Gx z)k1DC7E4^c-|&WmOb3MZIH@F%HfvYViL2y!lOc<&D~}*y7?(J<+Q@drMx`80Th?H} z`mI@HMXgCmNs5%viFBMXGU8D3A zMvYXqud_9!M`Y+K0P{u{(*@xDPZ+Q62e#3>=EHb4Mp}k8!BcZY#{*EG*JFTo zMi?;W5Tijzm@h+ibs+W~iiRC1wqsI8K|Y16>aMH}k`4q@trU|e35r-4Aq6(Dvl7(v z3c2<`ALYfOmZiEW4-OFwgQfQw>oA?fV$7gj=rBB>`YIfWRjPWmJ&h-05QH~_sgS8) zie?70sVxmyQDX}Lw1T$cc$Vvg5GuEOhQrIcC$P=BhMN+D#Hy?ps8qyIw22vVCJN-q z2#~duk1An6224edGx-=F4ohN+=Rus&pltiI}KWh^qr34#6<)tfos6cnGi}4!(qxfNTYx#OPjcs=}?E|C{Gy2A{s{Soj!BZfv11_X|VvF9g}4|vvwpht~R zQK)WXH0_Xh5nx~;AN8{32{awlzW~R!gQN~N1s5vn)EctXAS@N?1FSXbcjJ!N4s+FI$R88`fV@e z@&H#eb{P`j(^{V9bSqz0GhTnJjHD{A15A)gEvnIp0q`MhHj&hrHu3*En*&bpt9WMI zXOVoZ%GDIUpHnGH5iQ|gct!%zCV*$CMhd1-d1qf$?S#z@cRcK+xrplNkaR3mji+_k(sSM1(5Wbl5bK-mE~_C9G)RAPZ#@ z=i)w+)NnGv2PO_j84Oe%r?-|V*bGzu0@Z|Bcs7;;sYRk;3QsIOp{qU|kLz|+j{}=k zlb+SZGnGaxR9k~K2agczn%i{YgrXOAIt>LHVa52k3oms&f0RFy>| z4LgBYyt=bC^~0cT9O02k+}YCYe#X;a74I-lvUjFyWUog{+uWOSV}{FC_;Uo!ShZwwn`LE9z_pE@USID-V@vvNUNF$DLB0 z7n+qGpX1x*MiEviJNCU2$I%1Q_2S+XbTX4lt#PTV4*3@BCX`7{HM=pj1dc6Ay`f&J zDp{)xTXjdSlz}6JMW9VFJ;*|!L2O~D=Dwh|x={@(io-tDAi7Wum^X%3N2WidJgl9K zLf;Ph-Huww#6ATA!KTCD13L<;aWi!eB#$VIh*3bQgP4(~#Tu0c_e%8^Z22@uz_GBd zTnhVTaB+yF!p)>^!GuY_T`5fMp&Wp11))9=St>_OfN&asci<(!JBVX4Y+HfcsP}|& zsCc=dK{(MYWI(Wz0;E3&+LUx=3X3y!(g02c;86-GB{zb>_(auoESJj9KzP+8wPGo* z=h#6GO8kQ1CCG3F;(~CgoEmm}qhddr_M0WgbuBzA17?UWsQI94XNs0tpHzXP zOP6}pQN-D?KB~cvIx4Fe8HQbGq_%0#?Svz~+9l;)a~1^tAY;dpClIr!+U&3%tJ)-~ 
zuv#93%Cpsms{&VA&5VP7;E6NbYog_*t7E35A?U==p_#7)Eu1Nngac1H?IAK!i(%hu z;)zEJcD`IFawD`(Sfn~F)`9!jE9offvZ|X@2ephDm6Sm1LUMtQi?LJ%{y>YLN@6TH z;doqWfXfPE^k6-cMm`JT3^U*Em*w05oFd>Ff(F6nZMswO8dKY30qHLA)tIkAp`@X@ zJ54}p9aASpc&U~Xiy(BT>sBt!J{wGOr~&hp8rla64n)kzPPQ5a;~C&Zptd?IS7Ci< z*b!O8OE5HU;iY~Y3)5n&=C=qF!O?;gj3>Q(t7T?OuriLwyjLm!=B6T&5razoK1fNL z^~(~O@fui{8E5&D$OWbw>Z;wTP1-Op|K;|??#KySKom4A=sj#gYJpkifs{CE<#GW8e2gJZkO}Wgwy$Pp}{Ei0lEqNZGiMB=jAq%ghj~2)9rV3Mo_6bDYfzh@AR~yAD8(9Kd3-8DC9eh`GmSB3 zelWA93>>AXZ6;`ypvNO!m-^IUZTbtOiU3OPxu_ePG_>xrhwpjs0xfvZyn zuw1K}UTcPiEMRg(9Q4}NTP&%Yqp?^SS?vkt16e0n4L69Y3;3lF%~neIpapBiY!@2L zK!lF!g&Z=ekP$u+)%v(emS9VZK`V8Jhnc210)9d#!4qXLK!mCd;YgmVqnIZp;}D|( z*Fx~3J{&N#Tx-F0yGbQ2%~jE{mQXdily8o4Yb22bpUKAhc%<=zLf2IVNyHrDv>3C zfQ3yK1Nlu&tseHOZnMwPsbQVYWS@ld10LY4A;=3L5;9MUj(*hog`@M;M|K7&zwSWuabf|r7v<+evq_zfGY`k3UU~N zanz>8wk~;PcuX`o$Fj??BeSU{&&Me^^ksl=X>i7{)Y$Lj;ULzmlG zFK74u?|2Y2`YIkwTCiAm1$keLo{otM-_i^KFaFJM{jW^qH2bJ3v0adTBo1&)42`Oz z_9kZ8stQERNygLz2~_<7)ubyu(*=`^OM&90Rh138XJ1{L|f zmt!j>Vdivm4Fn%rKDm!-#TsV6#-4TVfhW=Akpj}oPqB(v^dERP5J#4_0_#CH@x8}v+VCRCi#P{?&= zg;Aj~tbp`D9qJ4!84~0|7(70mQm)Wp;8rgNYBns5+ThGa1F~r5o093|%6yz15DMJT zD-11`$sj?HLTG>pdq3!zM6VX`P!m;JD$mlqN3|pcv0#j6a^*&d$tKoxym+8BGxl(t zz!QoqsV$+yJ7nswYpsFWCX^!4rTaapjXLxfo=~~%_-&@!>P~AOGmtBO!KbImkPmnf zjbK(L;D#|O`zoyXR85>Dok^=71|r?1s8*TO1~t){dhJR}CRDl@DmllnnF#4tB{?D} z%appgFH*CrN|pDL)ahz}ffmRqPvk18*GYE^V{Vdb)y-U)&G5d|ZPv%~gi$KR6!FVF z)tadExJOf?#BB(?R2YovWCSFfh`XTqfXHQ9=D-H|195`~p;u7GPvfd%RjEcLELDdb zWdXd}m#afvkV)C@di@f_rJ9Ka9;?U`)e==pV`RC~sH6`txF-}L)XW$l36`2zo;Xl! 
z5QWT=j+{%;a2Y{D#hMEFT1Mius!<4diLREq4bY}oXxk?A}MUh?3&v^0!{!Z->j zSq;bp#mLlw>(*UI1R3O!j5ONHbS$@G3M3DUCJeeEbSDd9zaV=cQP84O1gUHpjPXSu zYY<7M4KykNuv-WN@J8elpsS!ZA{(It82B`vL*&^OiPQ|yQgh>~1Cm^vq830b$09=o zgxv~Eo~;dwf|#NzfV{DakXqmLN7HbV!icAeQx+*S8s%QKqwzV}N%$(I6*BGi7l8>- znTd=ZFjRyq$xgZ+Om)k6sw$;$mTm+TSUJTdVijyK-3u_TFp*PsrfQ3h0dlyQcGXXh zq^NY$u`wyH(tc$u4MDMjW<#$uvmPs#9Z{jmxru+okD{i;*hbGgTUE zBt(>rGcJtVQVHbwRHt4au_Y?5MFpqg@)LzfEW#S%z#I`dW<2h;D>zjP37(v`Dq5e) z+g%n&v<2STD7Qe6Nz4QPe_Hk6Ea2`4;E?!v5j@P&IWTa&%YsBZoB-?qtrrQ9HagR( zVp~qU2`B-%Le*(P5+z-jvSyPFxk%^Ac@fYTgb@~5IaE1x)@Z}9IFv~V*NSRsI?Wa;oL~%bMQ=L+^X3kv57wDmTM5G)`*JxDE2UNS=-bSTw|5id4G}4;BPw=g5}IC7Oe3HNUK=Y-)~7Ovf}; z8O->8so@Mu6v(|8HL4&Q?+nKirx=yvge?R2OfdWHWSDv%AVChic{7C#@cK-n?uNOz zhsM3Yl!LG(`v#OxWS51eXe10A*mIz~99MkE7HFdhW>eKVWEKX==R@!|4q^vvh?FXb zfR0D?2!;A!t26*VS|)F0TkV$U&&n)#*~HHb>WoZw4Y*c-(sK*PI=I+OIT(%GCITRS zB&F!3Gt-K+*pc&XjMKCEOo*WrnNB+N$5}C)Xp;&&0{5>6_<|`eVIVUm>#P0NwB!O{ zLBgrX3`!{)0NI@dq|x^cQ%o^HX#}PBTv390J8(#=l&|<~qdDv)6-7!9D9PpcM#?JX zaW$0NT?3`L&{XP0o#T(!@>C-S9FM%-oOB?~UuirYw0wFf^wcFh?} zUjF=#Z~o=nCH;RK{f`R|c;&Xk4?pkRSKUjTz~HA+V9pAKj8i~#FuqX*KW>#wOw)E<{RP5x81Pu>>mH$p_l*d;ve~5;kmtbf3?S*wP5wx?_IItmkZCnf5orw zUS9i2PT~XZtcBZd{`}cLy8IUTD{J`urpGrHyuKZ`YN4~~mfzg`^Ya&6xGlc&0R8>u zuAd)s+KP)8teJO}@%06pEHC7bc;rXRfBj?e_r)8(UViQ3-%bC7?z21$w!8g~!K}Q~ z!mCbOv034{o7qQDPFS#dqXU^;qk}KK@!+=)y8EL&x!c+E_V~u`Z~pL$J^ym6#^k*D z%aO(-pB;b8b!Xl5=lKWiwb!c!PUez57B`;w+C@L#*ZQ9ox6fUE*8a=4+i*o@*=`Fm z&o21eneVB`9eC676UD24xxzU9t;K4>eLTOh#a7pRYwlZzEP6HOf`yN7vE{y}h!@XW zxpKiD-aqT}OMiGkYpboU{KMSk`;>2dVACzRk3Tu|xZ(RhesA#nth-ih11>T~;*cUoi(8V6WUuE*WFuyEy`TcW4zx?pttf@jvd z*WSxoy&-ndW{)pgVLv}!xnkk+*GkFmZ?D^6`ETyt_A9dV!ikIN1+PO~Z?3)1MzvSw ztoPUFKi}Ppinq<%W*bpmvhdAKKHc=pJ-7StgAb3}e^g(*i`3%(GUuDcuJql-SI+BS z{PoA&g$oLoH=6vex^di1N1pYy-4cB9PV@Hv^!yLj{BHR^8|`-4l`GWs9^v<6mv3;@ zQHMNaJahVao5Am&w&H;GAKCS>E4RG*d&j>r@3hu|3zoir|3*i>vJ-a&`!TuIckkQx z_+MNyrQpBsd*$2H(oPFrJ(%0$>({@uN;$zn=y6LrNpP#<7Pqg^Pee&0Q 
zd)^gWUVr4--UXNc&nGMYbBO*Pb9X#*<&h-*#%>3*?aR3nPTcM6!qF#Hx90a`H@ljC zgB6FIdF3YG|AaWd zvE=Yq&(wC)lYZma7=7sLr`>(=CRgqhZ|JZ6@ro0cpZE0@%9aPQJAQsU@reBG z1NQv=-{-G)_V?D!QOHO7oA8%^wBFpc;o|*I{_*MSh}-6s*|o%(E4RPvrls!g?+~Av z_niCApB7l;^Gj|Hp1Ae!1OB5`HvAgAyRyTP`<)%~iKk15ZZMEY9Wy|f??Q-=6TQ4&^W&IDU)|-3seb83RTl3z2;nn4h z!6VzR=x@2#rq4CEA8oO0hw?D`-Bvq1cIlOGEZg&))1SQN!DqCmfAR3QFDly1L0kTE zll6~%$z4|8TUqtidz;;~-c{S6!t*D+^U34u-HJbUqx*y16iMCw&x?Nj=IVQ2cxBc4 z^jkk$VI5q-Y6HfVS{`afcAN{aw@3JgHzVTE69MWy5 z^7`CIcG>iAA1q{n^=C_G|65Nf;jyuWT z{*nDQS|Kk#3ETZMW~)=b_Q3zTBP$kc(wlC2>u%*MpVN zw-Gice|_hh(H7qyKjtpmoZX1r^wcvJDo??M#)s>e^}e=?{?3QK zwS3MUzv*q>`i@!2N@Exxtq3XQ$&H^2GTbsHbL7kkyh zzBG5EyPrQk+_GiwxqRDezqjrBkL>r_rMo`5es6<2{#knch1}{t_y40@m)-i!-|t!b z$&3-I(8d!CpVnlyB|3?gyLwc~ zapi2cU$nQxPT6jQ8#lV+#P>SqoOI4b3$2^3qmO$&UbD%WM;^4(yy}T}Ena=oXfX{pLTK)!oOxwbpxK=}lW+ z*O0$zc(b&v)Lx>mM8MP8~8_Z@&eHMcD2K&3`Rfu)&hO z7BrU~{MHAzy>-cVNavJici8HXjkntGwVMt(`I@~iogTl(4G-)vf5EaXg$urM=j!>( z*Zr#hl?na+YsbH_eB~eK{GyOsvtaL4GynDH&V1?UU6#G9BN>~1>SqSE)yX&g-A1v?C%aNynp)A!P)=mc;~tKo!4V-^go<)%gg^&ExRqd0Dy+G zFL?0<^>6>hAd5F=esku^oew_l+K;|6*n&L(4q5ohuYM)v|D&3ATXxqsy#;%pyv0uk zzy8Wl_nr!%&As_W*S5aW{w42gm+Yo2+N`|!u5avs9&-6qzl1)=UvTo` z1t(wj=Hs`2e$1NPURnFlYW(tzK7Fg=m(M@yhv#mK9RU~j{Q9x|1OI%>Y3nUJ_ozGn zy7RH8-GBEBFR$8f@c|bcef-mZe1+Y-%-s0tj%V!LTna`-?=4;Xv+(@O4tepf;MJ~AzJJ^ytAF*;8@oPmy7tlcC-*(} z0@%FGF8k9ypY6EC$*b;qde^Ugde!gl?|<%17yMz@mBQuQF*hzB?D?$m>Vpw}!7dN1 zoA-Qu-_gOJoV;qjvhK(W;nmtX+UlEUryO%g)DU*vWyJ>j{hVL(lMgQM1aCmhRY$$L ze5WU_xOCC&FI`fBJ~8ikCGkI=d-RRLIoIwhmo8Luk91D{+x_pY5tDg;Hg+suaL4P< z9s7Cf^?!W-@yoe`&R%m{^xeV^%_BbjP5k19f6A>pH~+E!!R+ebuGMy9)z!89t0$iF z%sJoP;-ll=z439^y6=rwCU2i{h`7d&k(HA*8}Ga%yn68t%|kx@&C}?KN8b`Zd3^HM zrIf;aWa19(pzpQee~PepI_PTx8Hi@ z+2{W8@VV#6^0VLC`}HTzcOQC5S-QIa+!pHRFNhZ}Iq#|0Uii_wx1N8f3kg{^5v2ulUAYORl)<^`~#% z@9GDyzu}p+4=jE8gQe%JxqSco-+E%d1D^Q%mGjT|{h3c0mp%OO@F&V0=c2o9d%)X| zK6u`m=;rxK;rrYC@Q=m@{S~)7yWv4AcX1cNh_~n2E0b$}aozWRy2-}d-*?|*zd!S> z-!3`$jB`J{Kw5J{Vd)j8p&xF#WO$>o=&;MH*ZuMLXGZ$!hu^vT>0ey);PvB=Ut0CO 
z^Hymm%~!s;=}(p}*=9ZVs&xR+y}quw_1Zi49{h3NWA485oYmf|*POTZ!|TI`{Pmw* z_v-fN9Xz`Jqt9M;4hXHY>3=jf{^>UN{&m-HJaG6!n;h0|uKM<_|5%`WyU{)A-6x+r z=DMRlIrg!I)PvX&$0m30KDpKX@%tm?s=Ky0WS_@>dG<}~^jF?KLp%8McON|D;}4E_ z^^-N9Y<8yejCI^I=+CbxeDI4uAUizzaP(qjm+P)Qn%VcQ9aretyI(!wpV$G*x4mQ2 zV8PyVm+_x{^APrlr|w;abnZLsQg~DOAAdh^@jbgNIqloudgQJHPx$!l-!8r8?`yVw z@G9+wO@yO&==yt7CxC17`EJWDEbg__1)s0`WY_Bd>@Q#V@#&BK&nIt5Pd@R>znypU z8+$yk^%eKafBJTDvtMMtvuW$a{U6`?f#tj33|`f{_~|p29{%X!cQ4xMs@-3?=;x1? zB`}Q>c6^zYzBcc&kLMq=Y~`&Ro{ql1=MMLry!UMFPM5FwWDE2Ct?#;jK09~g!;iW2 zefQAIUfg8o^IyAb%dMAh`weja^@}y?huIBBCUKTvK=*m;)fAZR| z=HBztY2Vo4Md|J(2N$>8_@ZO3{)hGCZ*My456s1{Ex&KG)xopfZ5M^cX-m$!?c$}_ z1It!p&wccpId4CC#IdiwSik+D_?2xQ`|XCGTy39p-lr~ry+-* zvhOX=x+C@pdT-^OrITAPd3v8!>G?l=*2D?oU_Wh&}tW(`V>ot;-f6-+J>UX#Ljh^w}Sx-(7mdgNyDZPW$7V z7j3!u&tA};J#5YLkEhmEOCLDoo}0YKUVJon%zYQ+uG`yPb@CZU(`!yX9l8CMw{Kgu z+IaZUn@ihNZ_oa4w)Nf@?ECTqZy$ZvZr6NAdiUnw_6^rQwL&@hs9mr7_E?@EW4~Ey3{#O3mf!E*o@C(t4Z?I2%e#8U*Q5n8` z<1wGTx7jW+x$dSLe%}7&>K`87c-7T9&zmE?1@jenc|Kmw*lWG2gkL2*V zVtt1bez47{iznB-eDKf4XY~i4AAj*<*A2?=R!{rvn2*+CYu|WipS#}r(GD1Oma)%P zA0P7qw)V|Oh}OvJzht{c`^^G^2 z_`(M^Fcd2foj!lh#cMzN^=CKVcmK0XJHNUgegc2x{kQTLe0J`7$2@=9$$#Dch@&q( z>Z8pMdFheY>-%ke#L`{yJ$`-8scxZk)h4$;_4Wz7DaWq+=$t3k+^S3t4WInuJtv=M z{#t&*G{pKr`{y3r`?~A4-)hTES2Q2IP`c#2-FR}RJ-++qY{~nl;@FxW9B4cy|JJ_b z?RV?9e|q%|#pJ}#{=!^*)V0q5t$oS8m%jJ(Ik!Ik@?Y*yp}`5qeE7~D!IJYn|9^bF zWmH_v+O3_0U?I4>HEzKjf;RzzgaC~O4-UaSxLXt4f_s3*JvdE};O=h0U2+!Be)s$B zJI`3j!blJc5!d-4X_6B6rAU4$zY@1KNVI>K z&hGcMVe9!e@Zs{~RL&22_0~Z7^VdkQMnrFImro3-T320bBwoqdN6pXsm&(oqVGNAE zk6yA27BP6 zt7;GPh4L#?NO}3AVRv(uY>|^%?UuWWVU6}m#j&=ifssEH>lRloTX}ox;d({2Xl<-E zSF75@ha6o6_y*EJwv$S8{~V_7{V+pU#0`s#$Q$w)y5YvBspyF->#sUpmj{Nx?~B_! 
zzpv$p=u?F>q9vUE&?RUJ)~Hm{C?-uUa;Xhx)@%w?o%F7@NTC+PTXRi!oFd-iH}dZ< z=W!qO-nx<|kQ@3eKUME}ng4LOC48jX!Tp{$OsC5mQ#s0)vVss-P8L%Qu6bQwT&6Re=drSSf1cH8I$|jR#*!y(`jB-T0T&l?ZSX5 zB0D;SACjKPUbi0=epAwT1NX$BSW>%MxR&5uDxC+ z<4(K?Q)76{qA(q_k!ULVoS6$PJ$j^@Z@l7Vlj&fNKg_f?I2_2{WN zv$Vc~i#!2TI30P8r#MMtw->XH*sFcHMUo-Q&1)kHRU{03UbDGfp*kz#5@92BzHy)B zgRg+AyY+?2p@%}sukd1>w0R}TW$6U_G}HGzH>GxM;}FdSr+)2@GYrq3NZ7M($Jc7; z+vHi^7HSz@w^K$Lo-U`|A@6w8PmV?RT1qiz`G0#H8|1u$ty^!KR|oT640a)mfAZLC z45HEYeJ`J$sRr7w&#;OL`OfxI%F}zXnZ|7BfbQ31gI-omJNn^bM=XbWgea0Y4K!Ho zM?+1Kz%ez%ElXFgIjC*nAtSU|CibTSN88)=;LC}!o?ntaNApKl#ZDLWTjY6WZn*53 zT(S%l25*TP+D|SEqz+(jCPK8>Jltz75b~^hue0S@62cJSd!ubBa6+>PI>@qjrHBGG z5ljkZ-NH5pGZ{#(RIMY{QIq;)C+E|sEdKZ1hb<9144=H{4u_WeIT7wTR!5rQvJA<5 z3_3`R2(J`wAshG|Gy&J4E`$iR7!@alm2Ps2doC(>q~a5fnt}TbT`z{)=-A}mZ}UJg zFPk4HUzMfhUD6NlU#3l-F_4g&55$iXCjSL`gzhqU-uq6DgZnire35m$40~NaC*}1A z-wUS%;ZUOz0S_48M~g;N=aR``#z!%`bRr^LCP+-Xm1@+>7EwfxOS6;UXP zRHF;uo9(0cEzdw!c*F5y;&gfnm9#|#v0`+jeZeLj7lY!yYvv?~Ine}hTNaeO%3YoK zd0Lo*-lPat4WXwUE(XJ(*xTwL2t2f!4$^&0OHUZ&xOttEz*FWb=iE@8J!L_2W5~=X6o0%|sGlzVLES2pT-{`}s6ih%w!=`ce-(fn zrPAYZT;b~)(S+$Slg`1f-t^A+PkUcF{s|)!H9VZ|%|4&)dpVfV&7t~@1}j+O1Z`w` z()91VJ!6r4m{c$*!igs6-Q@4Dw&hNn_(?@T<;(0ddP5KSBZZrZKYD`7)$BBx2CO43 zK7XytgT;g(0kYy2FGloz;4*tI%Z$}OQHD5;SAX?R*-ou@VGzFnr65)`s47S86^i`* zWjxW3ET&!lqL0G6zj_rsHNb0!*4$)&)Z@9pO#>0cHdlawBM(~h%6!hqy&pIE<(c3E zVFKII|J*bntDCYDW`m}{uq-f^S^T`+1sTxSEV$WWQ@3Xw+y!f9pc~TkXkRs_7SKYl z*w@}kYK(F%E@fwpGQKT}rNH@4Qdl<|k+i={91&O-(GdO_Bvkv^WX&j=O#EYBjfYJ~ zyaO?^9L)a{9sIaV=GSETkd4E05qvQcG^n%IQcJJ?FoA#D+AJ!Z=FNVo6@Sr^(#P|M zrH5dV*5(^p+#zyo;SnYG4IktA;9Gorj_04ke0#a^@;)@TJ@Sp?31iiHur-tFlmVOS z^vruj(z~y=6|T8)9$b`+=7b`P3prZp9Z=Re7n$05CEe-|%3`Be)We^v2g3v$(PZ@c z%a;{gW}PsF4=VFOPXlH2(kMP5ej*6?76c-@Ug{L5#a;b zGJxQcG9BeSsvXAFCG3hC>1ljLacX=JcsRF2sv`+7^>;`(L>OL!SOec!6J0ET>Yxk9 z0|j)~wn_)t$dmc?b%-}_0LmJQYewyM|3M#8tv}6cU18u-#+htq&`iaKi}JAPe?M^^ zBx{AEYwy!F#Ww|Ppjx!4Xl;acP?^*`54NMp;QWz2*cu_R_njkJA_-LH6GSBY@{jl@ 
zGQ|F?;C^yawhC865a`#EEfxbITIfN&?Q-#ZZ@E-^N&%KndG92`WiqlsknT@X%$4@1 zMS+n%8vMUZdT!E=t_twr?=;dIzvbaPeMQCY_B*PDzw4L9R<{5qr~|YgKjKmhzG(_J zIlNo_%IQ_H@V%U5qA1mDnh)sF9NFy~ksK?w%jh6^qf#W7do5u|*w$72zBN*A;hx-8 z-IkGGUczZ4o2x=lxoBeWQQ+6)uW7~SYjxzv;*Me$w42)(bGhFO3mOhZ^TQ=!fsbC} z;lyT~i*BCg@x}*3bn#LB4boLKd&{*6@($8?4^h7gNYhH~K@nf8W2-+MwOgF&tKThk z)w=$AH>*QOx(4qlwUk~Sw&*Zq`#nV;fvCvmm*smi8CrP44zg#p{3ZRZ%r+4o?pQ+H z#=tUsiRJ0D*Tt-UTvJo*IR|Iht@&`T={b*-H|K?6dYu{SP17)J(kGZ~GU}r!_gnjI zC>7+haKVVLJU7GZafo@QWuDfBZHaX!0?=?Dtamqd8&?gQ_FSx77n58Kl6lvj(2=`n zEhS?rMN>gOY`1(`gUl8IDh@RoWw(9Y@oW9&oJg?%1_Y0ldV|!GvFC9PMcdThq&@qL zU&x~csbFDV9;FhTt_<5=mHvrJQ2IV8{ml)^YMo0ZG`+AeFk`GL#AEmQjk}QvCBER5 zb-eXFQi9B;ge`mp3j+*8LF2rF$01#veAptI$F(c=(P6+Btp81Y79O~`vs><+4M~g9o;B)5#uN`fTS!1TD5$_s^*F%cl1#vmd zcE)F#tipWXr=<3NLMMDQVR~87?KreWI#{F^N~_GQ-y<^iCmM-_IP`#GkG%7_eX6cSInawvSm@Stz|aK!4+?;MH)T163?f zNt>j3rVF?z-#*knOs>SaqcLc#XF8Isqd!y811iIFt~u3W<5UL5A*uOe(e1aR&y@RT z-^^cbZIyQ;u28A69=Hc_BAriQ(-WRMT8Fo{q#qlbB6~3oEh3Au++5nk9?u>oE|#`p&bNdqi7a|@F&9da85DVgXT7XWyAYfg zc}=6xv6pD3b=_~M`*EMQ0jeAQ!Q184^{#L_>TeACx-!aQ6GyS1j}R!lbTP$m(ou20 z%DtpU2<5U}_mSpXsB#~`gyN?qZbes^m{$*yqx2LcVQkN=B+qJ+4X04!rL_yE8wVJh z8OQ@~b;9Ko5jxO2XGw)4?OkVmq2pmarfB@(Y^oPiy~@@KmvDXQ(lyx))WgEd1I_An zF4^@9$r{5%_8Hqp%NqNsiW_C?`g5@yUA2?@b=)c--%X!R+U-vAFLRdUH+!ISU-B7; zod1>~#M1!8n2AQ0i<^gQc&Ncq(-4D*m`ag^&1 z5osfjtLYP{Ibi{#ZL@5>pJ$=0ufJC?I~S`suaHZgGofE@x($L5&k^jDz*sXK~ z7tWV7Dl@7%Q^#$4C_u{V;|{T>HL#^nd)6b`E7Fxjv1o zqOT37gLu}pKnY$@j`@fGkevOco@+rUrNS6BccIZSuFP2?9?mNUVtr;QZKp{0dN=s$ z&?FKretnFaP&79q9ZY9=ttv-hs1C(rvADIG_Q3Vx!bvNGR}+-V)JyEHy)mqLIK2BM zlV=ac-@Faiw3Kci-fEUc!bl9oPN>^nk#jyiaa(x7e@Z>_7!Qa|lwDdr z9X5$o(%kp;|0SIZ7Slmql9KY*B&G5-Fh!Xh>eg!op=DTrAbomH<+7LiMmfHo@&tSz zj%;rF-I7#U-C;~xEq@+kSy6_ECRxZOVQ;kiJfr-Z#rQrJ9nd70K=8M0oJ-T}?k}#E zdr6Ht@Y2npF3gqIB89=UmOdt)Sd3WVCH_f$(uW2L1>k-p}6b&yw_hK`a z(&0Pwa%X%U3#39#q7^80GrefdOEQsQA4p)28>UB_F91Z&@j)4mQj5apb1jy2VTBuX zkcK0mN@`;^Vrz<|Lo&bwkvgWEq3VrkObBeXIj5-7`Mp5ive+ZK<9Wcy|56 
zHpv&c;~J}94@qHU9NkBk2mFTU_w2LOknVigd$f+jU@gfv0cg<((gw(V(uDoInXC<5 z+apOMJlkD|!XFRjbjEn21hl2Y>f=IH(d+jG2RDAh5RXRXjggWCJ_o++aPK6@%h*&% z!uxZ+VKi72_)p`cF-KNXQHD>Z3--h-S;2Z?rYjcvcK(>5OtUvz_0V#w77~$E?0nS` z0{#k#7qOyS&fWv#r|*(lMEq&52mFd389zz~kNA}KQdgprRX@d8iWb!$-Am%{)o{V5 z8$p?RO$D-x(;t=gmhr|-Qq0H4-Xr>9ey9qSRPvVO~DLZI~78I^jYcQ>`H~NTSNEE&I zy96!n3nz6|?ph;ljDVyf(QK{RIw4WB_b z%;?M<_eFn745rfjAhEm}Ofj{pe?#?@z#kVnOs6hsh(oYWhdK&F%=H2jf`$olB6I2K z;Q7mR!XPwK-aNr?TY5;T>0bjpYPiJ-1-5o#shOZbCY+dpfsBK67~TU@G2*=M3G(dU ze(8K3!@mP`bjjx!3w#yj(?GzUYKPT&%^o{u|H$`|bF3d|3DqAoM90+5T527t<^NWi z+*r|TQs%MgqSivTq|Q!Fnbnbi;bY;HvrNp?Nyr3%$r ze~kO{YQZ9^ohf?g!}p=qD%z*I|7(y5`oWS0n_YQ4#uCCd(qKhvofq7IjN(^+aJMRFTp3-$F@;#Q z-6!1k2K7t8er**X)rkSy@o|-{T+xwfh>3fM1n9du?QcAc z=Ddv8_vs*5OViV>rI;E+TrTy%gpzG_ws{|RI&o&O8cOB*NplK#V-hy6M`(;yE|HI` zjcBoQe<0lY=4}jk1J2&Lm%FEUV{!^@v8Ppi7vH=w zpjfZ`w4J@Eh)Uq8>t_tK-;}T2OSH4gPH5;}qa4B+{c7k;J4{e7PS7gKPgA+HnpW{--$+Blapk>V#7N@#^2QIH_4$mn9y3MN$vRD z-=u9X3PTuVFL}qh0a2I80tECs<2%9~5LBXwqND&K3yo2$J*z8E4y(?IM*81ni!)Sv z;B2Zv6D%s34`;!=K|DC9#hRUXuRho+Txh%S*J=H|KdY!hPvyIOzNmlH%i~OaiG6cM z{sZU=hXnLu2_x_@YvSjvD(7Z&lm(zNb#B%(Goa3d&wRgtrU%M6Q5@OG1`dz2oo&w* z+x}gjuUWW7A19F6SY*w?sM~KM^K-a|pf~v12~g1rB{~C44AvV{YdgZ*&Zg4tCa~hxyVgqqYkni3~qC1S%>`7XG zfUTNWFdNtOaitx<$1NrN{xjH4f4Rm&rDKOFB%^M7qJVqS60>MR$5s58vDhWIqwOp z#m27q=XR!VYZ75{8x^Vk9;%pcn0;v%lKf)>skA5~l8W+V=pGSXBc*jSkang$@b;1t5f$@v@V^I{ss^ z7n2gD@JwI?-{lhip1}3oju1F=$UO=zWO4{NpE%h`Z>BB-- zo+@`@y6&NJ)Pk=XsqAO#pk*DR^Q2*{1@z!sluwzXhyn^BdE0byFj&MVb}w-X(0*+_ z9|`B}Bc=f_+Rxx^t=k2y`1;ry=`@W&l@!vn64i{quWe5%H|j!6A|z2FhbRx<@Y|8m zw-^es!f8yg`D}J}r`sOdRM$_AcGr^N`CxG8UT3HA9bvscSbW)O^K&i(^dos*KHu zE4B={th0Ei1fg+s(roUNK!c`5a7YxTV|zV~rMaT*T3Y3G zWjT}kh(R;-EV=7UB>&8hHKQYa^qZtFuL@{%zq9avF3oNgE?Lh&$E<>{(YP2@4an*d zDdU0wmWWz@z&2@J0%nb4Ge&3lEd04gu1jl;-9Gb~9ckzDc_drsg=aQME__7ljyqe8 zbUuKb5--by2dDYUD!X&UmSvcy7R&m2MRnh1rsiW>{o$Kg#1_-J7Q!eAvE8(T@>o(3 z6IAy6(7My|Eo@0k1io-e9R!H~6(=Ku^3~X#q~AQuA}N_)hb{ZqMW70HqT{_VoeU}> zU<6lS1sL;x&4|Zi+I4+;D0`GJWR~gET(%Kw5$J 
zW612BQI6n!{L>3>H|v+xk55U_+4HPvhqMLc6X{S7lyr%xA<4XW>^kFt&cwnsA;U~= z!Hp_>pBKBjBR5{er5J-t{*a!d<%DgJjA#z~W3+~(*|UEZ6Vqw$F0zm!Zli7Nu@ae{ zH!-#9X-YDVek%edh^h$zQS&$GiHLf94O_=0>-3wkux9Melpqti4cje}O8|ed>EeV)_~h(V=aL zn{B4DtHatL*-p)k5#l;bTb+_~0k*Uw(i(J}cbUedP`~ADB+xw$J!yB%YZ<^lf_-9> za@n$DOpg~@^JEp?Z^I{jX(R)atVQa&nR>fm#k9gK;eGk_%G$;0GueURj9}YAbZm)O+#Z@C-(pKa)6ZU z5Hfx0qZ@{}(ZIkfMTiaoHv`WBvz8 zxPk@#_!2P9CgYqR{s-xM%0y6QKLLg+*Ls`(fdb|+11MqsUVoF%f1-pcPcLV@|MBHN z*mN~~09sUYHqmSU53&->_(US&`Dix&2Xj~)4lswsLa-kO|M8DNc)-hXUbUAl{O`qM z_^HcKkw7 z`nky-`I@0U^NV2ry~NzVCo0T{@~TjRNf#gvnM4{+cC_&##^X-iUcdd#zqopY&=38$ zh?2;7>EKK9t+N&wjm`klm>)7#0_5=UzGni^O@xvVja6X2Dxci-l;!~O+RZ=OInQUw zQyV>hBrgF1nQr_~@^}9+M>suVf0P5%8UR*6&CPy&w3d;ekaqKg=oth2^!Y~y_&b#T z_iX#52dW3_ZM94vS-yZ+KFaaEe+V~n2nIoH{*dP^V^T~^$RS&FD3q#KY_?jkiC!k> zs<%1GqwkNaCK-HDEfBr_|DzMr1L+b40;b#lK9)``B9|(~doX*tpQ!u4$!QDhya7nF z>CtV?e>kIPe!^VvHZlZ=c+WTxPN0kJ#9JdsMWO1oWPNBa%e$#$BA;1r&B(jnwcU(E-?ma)#I(jz&wx{EL8GNj zf12O2f`y%}1TZ9RGRwa>6JH!nF;A}?A`h1?FBHAIm`{G#Os{@Ft-vyE39h)@8*h~P z(K$P7a>KpwM5H?7 zs*JMLV(*Z4-7VXqO7Q|UPDi0RE6#@7NTR0G7;2v>-#OFG70^RyIM~h`7ty~@tKi^k zq3X4rTH6%5+{sm6Z+gWPW>NHB2ACrf#kEOQV2*;U!+rRHIjRD&TpzdlPDr;N50I4m zI+U2PdO^^V+DafXg_^mq$#_-g#UCWc`_O^2K$hyHk~A#x4y_1x{5ylx@E)X^6Ud+N zyd{$V4`_U@LPHJt$9&?buwD9;8IvZWLw*-MF+(jnu{s>Qi{$h(AZ8!HaZn{M0sGX_ zd45RtomIYeqoJC4f{9GP+0EwO&iF{X+Bg#bQ=eVpHn~Z%P~95MVy|fXb>^Hi+*@S& z6HZ065EP%oGRp+-xSyQ4PpYFaaRQ2stQHu`J7GWy4G=wkdoTR@a zXnrFz8p8$;e`ziUdz7Lo4f!kn_dd79{J}itV*1C$-02BMYn4y@4|W~387p^%@%IQk zHcJt~CN|5B3d@JHg{oK1%>I9#OY!@2~OJ(A%vHvs8NV()6TxZG+3+#SVe z55%@li#YHch?4-5MN#{j%Y(a){|j_9@V#sOPt=ji&3r2E01Tt)n2jqth-1Y#mdv+R zOL%2^K5&3ajKL6mw0!i?F=O&@eKxW+_H|K3HYsXwOYZ%hrv@B&Y7Jqx;^&&8?gl*wAgv{^~{~=bF^iM&)XawWZZ^kfdUfy&7`XYszLDE!>a&h z-MYkEr?wAz+KVK0r9@35o`(5L?(V7XW<$4EW5lBjIJDyX!_4q-JYei#a zuxkf?!xwTV?)4D>hU87Iyi0|{y%*~F@-1ZqGff;C^c@{Nj!13kYD+#&1UxkF?Bqr( zUe5rF2&Ph2yL!~VI-1qDD9i7FU6XpZ(Vxe+5N+fjtN0J+7>PYrni)Vj`RC}8=rmB; zva6uHcd5iZS9`bjq3)MQfU0+9H(>XvinAA~mZaL24`aKv`Rc!6yv0Y4pB^)pSf 
zky`S0ZssoG=5yB_d}n?D2>fQRqcBs45flo56mn0>n{8svk?2AZoN~t@$;JDjoO4(< zD2SP*{@XvF!8#12A(E+ySwv-i7yHAsv+6*qx{#kw2G|(vTF8k(V z)_EK0X2U$Y{wO;39-=WK*iAj-olwO2XCSo*HR^m@kp$tyeCM;j5Ice9hI`SJnI_+F#T zzRpdk>k2)jOs~}PG}!)FERfK!{Rf+p8+JIa*Y?}(PgjWmr&KK5yjTGUOjbTobN28F z&X5D*Vv+V-Y6zr9s3)`)!s+PtT$<^j|k!^ z=urQ-W1#jc$RtlSy>7)`FZetbDdR7)N<2}ofcCL5KchG;Z905*eH{C_by(9PzxJK) zHK%55nU?Yfg7{Y@oI%bn}7H=3QI z@9y0i`}GsmLm#E@UJcIV>o}S*fjuDtiXT(|O*>lZR&;qEjc%yk9{gIg zbUgZ0p3EyDet#Wr+fbXkCI0X|tOt5@+=WTJh$(h2O;^JA-d#RXd|c>iVr@J7o2QCs zO(9Tr$}X?|EBg@Wj5i6&U-e{6W4#T`816M4j}%qj-BbD>nz%q7^NR#J#Uq z+haciQ@Z9v1qfn&9P3bL)?_}o(q7ird_vKgs zzA|qh|91%Fp9M4j|A{X`v|p$g<^3C9tnyA4*%Av7i0e`1)`ngHO^StY^UbiQ`f=ok zkK4gkT|St?W%oCyOumm@wQIE(^suv)OxI$cs1hKn>+t^X!NPCxH z>#8SNq0^6tnyfRyY5xP7wmYG7CR+F_dPr=}8Cq8UQpTSNrX*eSPC+NzNv&U+6(5F# zn=H>tWSsF zr-DWeC%*x3m{oRjGe_?K1`(b2hxny+Ob0VDT^Tpk z)7x*igfpCbltRi9`sjzaP)l&}lex);2T`|rmkoYsW{Ujn(E>C@hUyqBt}YL_EcXhZ zEJ0?NLY60Ms^++;Re&w*uL<7F^c~LKw33ogDIhb9gYGYn@(gJ*CV~5JoGj#;l%B}_ zRU&6t)v_*^;IBj8`bTbsvxX?&?H1|Cu#U_0<`ax}eti*?n!w z01xN+V(;m&`oxM3W#Q{WJ zuMyGa@9eGDFK`u1r`AlV+pi9rIQa@O99s^<@Sz5aHQ z?`p{}MuWlm2aJ)x984;P@t5K-E#+%OofhsX?6T$%J* zB;QL(nZ|P4i<)aM!jDCTK(K%Yq&het(7T_(czNQ+OFiwy-xps7mJ2^d6K=oOMjR24 zVIb3_TC{Te>)^TZ5BXSw2)qYB&M+JNbX)5G!5#y?p)dJMvo`zz&u6iY0NqWsB8;H5 zWv;vL7U&~NTn;}zsa8)(g*ZkGug^JV?Am4^{rk453w$M%XxF8`zh7q3smm7Q$Xpy4 zWA5bPMssua3mjK=pnW_hdAR?z*ekfY(jgv6Zct<-uVHW74FmypQXmS_>3P!M4gl$^ zTW}Ig04;ME861rtC-}UKi5w~QIxPEF61$-_FQvCfLBWWpeEShK)Bmynlp>5EP2Fc_ zo5BmRtf2fPg4VYq6DAeqPzDx(vhi>u%G94n4o#@}lA1!M*i1Dm1vZ8&An&j8J+@g2p zAqf@i<*xj`CjMjku0sj3xsuj>NW3MMDlsmyscjbfeNC>vX@Q;vnfc?I5$^Y?UhQ8_ zShQ|bKrQND(J03yNi`i?1}^Amib@*?u$U#d%?MCjbwSJ42s7<)Hs($@rw# zK%-Y0dk1@iR%yChQcM4w6Iw>;d)pJ}B|)O!rYJw8-=bTX>6O#w?VD1@@7Ogrokrw^ zMP@bi44!jr-PVQ4J?3yALHM4+)Q3_lgqn_C7h{C!!+497ZreUd9lseG--tQz%m5td z1%RyuQ&IWJqR-|%Mzs~A#Ulbr{oaxmBsuJRt(*7i2WAhY2OfC>i*}NlM-D$x`Swv0 z6)_*eQEkw%m;R|JWt z%dVhb+}$scGHO3JHq|z=Z6b@6l1BnvH5~-V?O7dcT2t?^WM;&cwYZ<{P1cgC9~Cg) 
z`W*R4Wj0)354B2VVQ)1u#Vpua&(GAT=-8JsJQ9zn#J-P9G`Vf)Aa=p+)Tl6ZHIBF# z5&yeaboOJarV4ZC(u1F*nBmM%au3}oEt~yyqRD6-OUa@wh7yK!yTU)~L!?u%?iaFm zh|!`#=*1(feq8Yd@mC-Hr3o{J7 zOs%C#^*{82VT?sLvC+s35%i*%T8M>PtK|E1=+r5kK1J^v8lc7T);M5KhN1hv2dJCx z65g0$)k#ajSed8@X$6TN)CuX)h7(N;bzb=Z*)5*2PO+}uf;d)o1Km!ld>{qYsf%;u zy@GwQH?uH^3?=}D7d^5}TZ5~Ey6Vu6uHUnf6a%(QDvxnhtm0bnaB~$SAY4=xv1|GQ zRhF*k{iv&0cuxwQGc_`cC5E-V08#9-71URLYi3IgdUjGjW_CD*w!cd1g$^90gdnG4 z#KpRH6<{hIPe%@rXCM<2SQ`aE1gX~fg3LH~^MXAx(8QrKE+PgX*TU1S-pn+47dFP>{6R2B z^q*9VHBvQalqcYvnb9f-ayM^`kL`gj3$}JYbmi#?f}eU2de{VkPhYRDzOh*@^OHA= ziNXU#yd2950#d21X4@&%aEQ zbc=w^GljB|QLB(bh@8)hUk(Qd^OzZRM|-wa;Y6@5b?NxWgH@$Bv6fGJ#5qU2!2tZhOY}NsloK1V-3boc-0$5k|CEKG9xSD z)Np7VBeVOvB^%BQx2fuHG*2^`5A6FMn*?2ec2JWEpu8FAk$u6@1CX&~IcC&}s&S5J zzS|E*Lv5|=Y)_S?uwr)Z;D8j*<{QmmHYPW#GMIXy|Drj0CEo?uXj^+G( z=Q#~;a{YH7AkXsNVvJ)V(lLuHo6_fg7K~4gx^Ix48f~&V{@^^tpCBYS5k&&TTZC=VPrwt;UUU7%*eq^aGI{ZlRV3}xYInN zxC`T#l{z!H8tfNXKV@#1Ud>k=p0M8-d&OAzPVqbI$vKsCK8s*w&6*{fhX z+|-TwWlY(cW$C50olH>R(6+&$hM#ft`|Y1q0Em>zca$Hy)HphcH9KA$%DsMf!S5Ma z!Z8-nEQ6*a6e_4cUB&ve#+UJF`mkh!(B{*a@dtXH5 z;Y7A;he7X9FloD2fMkmGf%7BKTQBNYGzFjUed4POch(_3dLzQsuAzdBuCdA}z@jOx3^~T>6bVMv=<_!gcoJ*FaJ)VqMWPJW4{%Ob$#!gr(50K!@^GgPYR&n;<0}1e^?MF_MspedVtkza3abrh>(Vvb; zRl=DZMPXEmUbvGOe4^ZxlqIFVbQ9^3d@Fr5*>oNfDbjexXSU0*Z)4L!SriFkTx zhShjLR)BuX1Axv^ZoY?5IJ481#jY8X(dS$F?-H-;!&(E#yw!P5j$Y}qjLIu!}=ZI1Q}XBB_PGuiT=lTNDPc)-Cs0CC9>5ZrB8;b zxlaFFC6d(A=#yfxpU>jGj)j+AjHccMdr|oT{EW4BrT76stmUMwOZ|Yp{sI;k3WW>y z4EA@v#_#t-^r8cFG93LpDb*R6qur2G9J*5I>q7SKXZRWS&A&3aPx7$%?H=3;ixbmB zXIy@dBsfy=j|Q+M@ZqIn#Fu!?HL+Xo0F z_|U4GXj({B2WNGKKsvV+BcHFUKp#@yU>mU3X3iEaEOSdx8G?Q_{lwA5gue#uFG*k) zHiB@awgBomQj@4ywS`Z1Mk#JPjO-KMK}l#@U^wQfVReqVEX?JHP8krY=7E;2vo~qb z-Z^74u><<~XfK3Wf0qCkGhhB8EwZT^h>6g*zMycd)9P|Hk~XwLL~^@OIrhlg7)q1# z{rjNYDG{xjUe z2B5eXG6%H(LUB)erhP;K5jG1c-<}PX77~l};tyLEuULi>!DyYl#M|yefHmDOjp9h@ zp9knBVg9|E6x`t5GHzvF2g5f;eGj|xDLI|(KWK!uQFi* 
z`GI%BH}oCWV}m{PcKQ7NLhHYaTzJnQFKpJ-D9J9L=_s*#PonQiR-2850>NGKqAEju&x;1(h+d5YBf`@}pPdfowB1M05Le&_Tc&4JzasG!fw>a8%^`ugH{yCj%-5 zxioEc`lPv!^eOBif>2rT@$%n$8wvEBmQA2HWKQFJ8Qk?-Csr)oUU$mqV^C>QK;q{C zEHFnl)hnkodS_!ZaYXq8AI%TIfdp!U3?@LKD&xj2~&eVOnLwA3a(jD*=0V9jw28gHh5VJIzUF&2E-S1L-X`F zl2vI5(G{s50;ks!PB5I3E-0%V$VMUc>AVajow~H|0rUD<=ACgfu+3a$B@`SG1(p$5 z;|=5qOf{AQe}aDoCtl5_`vY`oQC}kuBH^8@e>WG0KN>LN7So`^obF{#gs;+A92mt$ zW%7wO2+y75cgV=d@L~)m9tW{+rRkrOeQW_m48G`o1rkBGN_?9aSeU}K?SAGyr{qC; z*~tDt$>rGUvH3-oCh~(~%;k{cZ?(>*Hx-2=p71~?3AJ#%81G<(-KQY0v&`6MJD>OOD?g*3o7sChgpA`Y8cJyA{4hUZg`<%cgdn3{XZ#SeK1}PFo z1^O(dVt_6DV?yJlFff;~(WUz}Pli#l6GAMjRRG=yb`=PX7XjIt1}y}cbPF)-V2F`q;*YcB!47+ks)xmO=)wz)4(DKGKFhT#9kx~TXp-WPIBw^c}a^yfG#>Z(7P^wA`Vy{+p z8Y-zj<}8Cf?Ylb)&Vi&n_b}H%DHi{}jiQkejLIa$q*cf+2mCe)=^32W{q;H<5Lrua zMcr;RaYRzJzI+8*0o1r`><1y1T!-jM2tLq(lrlRWy5hWpy(Oo$Kkdx0U#ODh!cI44 zr=aZ$ltm0}iY7H;#?RTw0?f*$@AlaZPAtHt#R=DLvMfjS_$+1f+g=1*Wgy)4vuba~ zH8AWr8Wf?FSphi-;XW!iCL|QlyugD25AfXwsk4dOkdS`<#8q9^D|q|g)xU6)2zM|fSP_;Qc6h! zPHNbD_y@z4mv)f(>Vb(xO}H$pQ$1MFD~ z5Q_jhj#cp6wki-LSuz$m_9i@P7f4L=b_B)!q#5!z8P%lm7;noTr{vD;OXjqHh8rE^ z?jye$EQ_clX|d#IGD_IyU>apcFEcWI#QyQf(3scs@{GRHF(7r?Vw=zah)=%fV#+$^ zlhyWj`R=iK&h$Me!DN;XHzYQ1`kOQWj6Z#(XWAy)i^GSRo#Q#lJ8t%XLzl2GGr?@C zt~yYB%k&X}!;R#FeW1AKEau?%1+9Wu_ye+V?Fy}=Qdo%f1~Zp^9_Vko|5WG1gsGrR3e$D?w-do01`E6apbVzsi29%O+Nr8<>3P?#K-Q6JFuxUZMySsZ! 
zBi$k0jUf15c+R=+|NDE+hv)nA1&qDcwbxuR#~gF4c`3@xTG~rbiJzw#{PNJfEq#=(3%EQVLsDM6vtw*FT?3{k!TQ>r^mT9-%=-NL=+JZ=rCq{jZ@|I zYp4X%uojIBZHa_xF8G^Lv#D|A0dU%WV8ho@n0T44R}mdoXV4lhNXbY(9ZZRL+@b`*9jx5l`}3 z%*j18RB--jpdk+p3T}O)n=sppj|x-Lj6A3~(`34n5=(_8{96?a2+2JoC)Tt@ddbDe z*@?RxJx45xFjd84%lHTtIi+vOorqbt7HzEqQNGB%;TphF0|=q38&B=$OMLOwFsGcQ zY<>J0U#?!XwydljbF7_}jDIPV1Ee^q_6;C3EJOAS!X6zl{xddo(>oEH3oGX%p{e;L zoD*0FKUbOqt;&{(*MMfVdtO~U?_jQ|q&ogHNm&0d*TB~b%~J_s`nW~I!pn0qv`8EQ z1k?bx>G@B{k(@3;z|JWK?+SwFYfW*zclXq<`@y zRw#agqzI=Lw*-D)quSC|mhXz(;paREnGrEfa+3=O(wzj*Ok|ZD(;`5TrHxi@kd!aZFad=gVmmJ@N{ptn-GcE%(KZuN zPJ8|yg)UFQDnwec>e5^9=hHM20KBcCnq1ns0amY|eA*MD0?s_aO(Eodz}K1DjDy)| zU#*?4*n&q8cL}EeafZ}Ic=gTU+b54xZIre@+Y?px_x%nT?2d~c{=IMvxNUjjJ zRnH=SqkVyNg7j|6>#Mo7qcD$=b8KVXT*{t1(J2XUZ1p@}P1dqkJ4|B?Mf8s>&8ViL z_81k+08KM!ZSI6j& zaKz?n@VnaOQs9aBgCzb#lPJc!jpss&8{JVdb3j=MAn$DiLk`FfXCzDQLOTB?W+bWD zRUe(~k-<21@!PTQxb`_dD{e=A1M(EPzS_MMB6 znrr^p4=!e(2LGuV1wE@qalJqOZ?#QwxK|MXef~C!(`qX|+# zP0R!etvQZ=$Ww8j3tfi2U7I=nNA=9_&($}-W0oJL{6n4^_7-UVHs|!f<$o%DOH!LW z7tb^anROod$AjKJH>aI+b^eFAHn!$jMO!0zD7f>F2hlz^kK1c)`JZ~OfCyLfSw*{$ z?M&hLj|as)HwRyRTKUH)^H`omxJMH$(f<_Tf}fj%?kjBmF-jG}XA$oItLEgS)BFwO z4j({|aWyad8jWPYg0fS5kf*V%l3vhuyc2u;Zd>L$#ek{edi&i?pZA+o>`*6N`VVQB z;2c0};7hOETeDcf1$w029|Rr9m2ht($UbZ>lMH>XE>V;1`9EOybo|CoBf~Q8ZZad?-*w zS2nyImw|cjAmJezBWp;D+F%tu*#P<#(0)Gp0W-_~XIV>-HKa~{3JoG?nj0msJ@W<9 zfo^f62vG+gyxg~xeM66t?BuI13XnY- z8jPUC7GXf56IA7VEf#5#qk705u~9TF(w3-^mn_kg_m zin)RR)CDL6SwpirEFjb+EA&y2X}PH?;@EB$m&v)g6i+LLn==^*@&LBM&a2DWP1vO2 znt8zp`;W3Q#%EDn^xgy-L?1vDMATh6?^6n6%gd4??{P-kzFRz&G43#uxdxQ^fXswN z5~szdm}Dcketl+{U6b;cLow<_oYZc+lU}Saf-@S;FDL*i=QnvtoN>`^E%?2jYu8eq zvXGULy=NV39}hLQ-)CjmzhSBCehopyG5*#@Iqykba1CM|l1N)&wwIvBhiZ`xMRZC!X>d=EvnEAl^ss&|0`VGyz{?)Z z)}DO7PT}+7clXIc-tcK*I&|DNSu#?7lQikeZEfYTA+SWG;VFB|Sr}v~>VCH=lNLGGq8($Q281Vlx z9oh`sg*N8+>TWV8Q}hc<7yQ_y^oA^!G7_uIc8CCk1|@e)j+jh?4SRJ+Yz}W>kyny{ z0~@K?IU=Low$*(*&loFMs~bny!T8E|Jty!jLz)` z#?K{VfslcSmtjqy@c|3@=3`!X4AKA(xk4H84#ESsC&$~6KTSmr{?eR%hyl(+ExR=f 
z@>8o_$TU;G->0P*nHh3I4yyRLKAOFLbQi7i;X^*C2Sq%7b?zpWaTKFC;=^a79+}yK zPcEVKukUHeY5(nGv2OL#b!3nb78lyGwNm!3Z6Ts4z`li>8eMECmcTIOog-iIJ#IJ< zCK$%&ig_J@wl$|i!D0MAQ}2pwsMRsxrx1Bb8CKG}qywkBamjd{cuikGP=@eA7>vVF zZTO6A5=D`Dy?EPNzhKvbB4ia-z%_1}wuzYUxhsKiI&L(sM+jTydl1AY8NohV%xYfe z)$(SXq1^|u89qU3R(}+G>_t~4?J3H|E+p~R_MoR+t*T8kFnuG&y&?r6n%!jyn&!{! zMQ6<{Ry8;>I1pMrVj2P&8kr&!kz7rRD>eoj0yYSwss45S&9vy%`W}Vn(966p@-t(K zJKRCqFE0me*7aZ5duZV%zc>vBQm1BEj)-bP#;~*ogB-c{jqd~n*cn+g$G{K*VAGmC zIl5InrFiOarxY%S)4adxeW#UL>GDaV2d){JYZm7Lk*F3PGKSeRw1gF!r$)r-dhFxZ zm)=>;AdQ4cD_8Snj_N1o`(^|O>RiGCyU-s0@ zwxFS#^`BX2>J+;_>qAK`{6GU?zfc8sMA^bb5vojjHhCVgjdjo`#(!qIT*Jzr_~ zkr1WnCwBIS$qD?#bU>)y`fbo*C-6m;>t?>eLD^6-WFkxD?R|fXH-{an{z~q5sg_Ye z^p8iHa2(m~URXQ=1r)I~_wY15Z;`rTECVB)iDZco?gvgeX67lof*ZJyQ;EQW3o=y; zrOaWEWTyJl8#?jyc- zL!(yqFH8N>qyEFA07a=Ahw#dvRvJAjFBEZ@Hp_?CnVDeA?x*yCb)*z5v?h%w?KN3U zV)~gUFZF$&bnkIAStqOdN>9{_{ml9`1H2r}Gougsy_gJ|$;K1FT~%FYnrrdjHgSP( z0{Z!73pC!R830jsjKdG9o`W~jG%Mt@myjqNXk=nIhihzJ%>rk%zSUqx zn&82*XaG|KC_>LOj7wB2*W+eZ>0zQ`nj z=IDXB=9#+!&=4K5WqCpvaH&A<>n{Z0eQlwU#?>}7U>&!VR`Q}Uj=#&K(P?$NDAaJ#_jZziN)AJ+MoOns^ss zK!^*{|1?H9)*o0M*yboccpl=6WdGu0QT#Bo=O&N8X(|;q1zX3hTWH%##DX>r%ZM7t z41Wjl?W^2~!Z&i%yx25sT{?jgNP&p4ENn*L8EVJKH$kQ}0m9>gjl|W_&BjNV*zZ9t z=@>`w>)n*2x}SI^UU$SK1@2nIGLHqV_mB`PXj#nhpNchM`!zv8D1o$^>`uJQ4ESlq ziN@!rKj86F$%N8yko_9PYN%zFsRg)?cI1EbZIICDL1+j&M6ya_4UoVmu&>cvkT-ydnw4{U zP}D?Ah%kL^A6;~_;ksaH@uA@^x+dIzl#SeElO7v@S4Mot5W;%VXm>;#t`A$yH zz9*|ZGU!%)Oei&u<~qY3ZU5z;5vl*MEuDpPX68mT5daG6dI&Myl{SfRQjq3sh6!LIT3%$FN+9TFa| z_H`7vC1h6Ha{WyK0k7eJ#e_8<>Y^U~f9<^CtKqZ*_cskWR#$z^F7p~}$3uSJ=9s=# ztWCmZo3unfbi@zDsz<5sY4Wjecxw92W*(H2@ ztYDl7`~DqYJmr>423a$B-`sWAL{ouy;fsq2} z=bqrP+S%Sf=tvYCF0`PhM>42y!)rzC-ukiI^gJcXgc%+kJ5}2(P!WYYE!9gVs%w^f%QqhiNkAFrP z0Jps?>Te3ts;}**16mO=Gs~i?i8eZ5I#yx?CGK!GD#-)Y(k91BBX>j(i0;D@;1aVx zX#)aJ-7>`&e2zO*ysBlgm}EEbc82Z9r7zqWxgZ2>R0#29>(_(*(Q=4~0qr4jge4An zu#vB(SVOUden53W9ml^=)W=Yyjde~354Jp%Ue*FO*HZMZR}@(na@=w~rTACk!u4wCMEoRv`| 
zpOf(5-e1_cA5ul6dmVR)u=~=(WIu5IbC`9mYkHcfAbJzd7FCDk&ufk{bNq(xuI1L1 zIzEq|?PwJ6=ray@Z`1OY_4_!pBNgYade`QH26jh}P>5g$AW?2YVWHKz>lJa7v|Gah zmYIXuA6l?YbvO4CM@U_PQV`C|Hd=?`q)>-aHI(t!ZDdCouaRFGiwksc@sd&VB+~eG8Zn z6StHoEVaEiKZ*0K#vVWmS1|ErO0VnG7M-af{mvl@u!4B}!LpRE2Jc#8jWPt(6QpoQ z$;BWHHhWEMQ_O?%B7>C22`LkvR-g{YF@seWR3biauOw^QU>&s7&Jov02~!xvW1h<}KK@`4dUWP_I5S*}==((U+Bpqu+5 z^y0Lz?6@P%{FB2yxBN=K<5*4tW%c>ptIytWI`yXM*Y&OmXbvQ6aqT(HQt-3ISi4qbHtrogJ#!^Wq^ z)W7qa=P+pPW_+)^xfb=*ce`EQ=@R!60`pL7Ys6(XVgy>ZLhk*H zy5F^P2PSlWUn%N5xTcA&XjBA>$m`Gf=zkn0!R$Kbk%ZppO{4p0GJ5h+UY=Bq)BG8j$#vzVyB61^1Ic(#yFu z=(f)L$g|Eyc?Xuu2*Kb3;u6gKd=81_PRMwNG+^X1}2 zoTlGJ#^nV3u`KzVP=#s7Au?Y_U4)W|py3j~4arOv;~U(dzy$9s)8!E@GSmeX8Ek|v zaV)Ux!U+6lBv09}!H)NBo$5-y=`Z)%jizc=B6QsLhL+TECf-y?m?z`B*seu4VmHRI z45EDPbJimj?_Q?zg-#5z@jAU9gT2=@j^&DzWN_PUZnznumnw&YP)AD*l4EWwaZu=T z^5@DUKKTghDBre8)5X}qo4918cfp#_Kk?u z+To@INp$NI0Zd3?R9HjLlM&_!BD&@1t;gCMYT9x;V^?mufG< z@CD3{hLR>9Ofk%KC@6QU&Jw`Y_!~}kb^eteOSQZQqT*V~$tVQa9Q4lkh9~KNg zQWdJ0C`_p^WmN@C2^>Fc)*C-}7;sGnq8+A~GE5q58^*fgD8C*|m(sQvOgk&2{0^K{ zc`^$-zpbM938Y31l!rCDFP7^yWc6??e702<-}%1`rof;n1@r!F-%(GoDKew19Y!67 z_?SjDA4V9_FC%voMiA~LY8IoPr*L3dZO9&b*$s!^xNxVX-;H!4D{R$Dqvu$Bwoa`v zGd^SnQOq1>{O~9e?I5Xt5s4lZS_5m0LCd`-4=l4c&qh+R#@;f9^b6wx8*Gp+OLPej8LcwymUhAXL?**ZKa!jBQGUFv zdR2NE1I%8#JAT(|m?=;2Y%fd!9Ch<%=MP8jX*@PBI5BIiRdgPaU8wy z@#)|Htk!6_vt-d|rXb-my)h|`?`~IK1Oyccq2_sD3;~}FHtwX?gyUt zcUspjCr2c0XYaM9t*2j*)EW;Kbw8eYbrVfyFZrGM*)1&nXRy%}Z*rXG%C1Ae$s%~t zK8r+^NyUCir;Z-c-*RMKtKsz%JPy<$_3&Nel(uT*NMeP6frjlEb_r<$i}# z&BL6WrQK=ZW6@JZVx+k0{$ ze7r%7!A+j(>u;e*?Wwey>Q>P$Uqu-)KkQbhBJr9I7+)PP-V89jQZG}8KQTn?D7yYp z4vJt4#h{$(Q(DG`IfZG$>cWRvg1v%?osOR$1G!T^Vq-x1)5}CMe!5bM;}oZtm1QVP zo)W}AQQd-WGQ$_m9D?!6a9;IxZ#pt~2;@3a0~Dpb_ZvTE#@ypO7AW?7 zjNo0rSxR*v5|Zi-tW>iNW-RTqw58tV8aJGdeJaw)W&7G_@b&W%K86QcgmI9eBb~^8 z)F|1U;YdPdV$NrU2e@PH+1?#*Ut05;2P0|suTDf!Y1?=(Gq8XoL+awJti0RX-(1k>kl~+*m%1Q$*ThEt_;&M9Vkhcc1C1fJ&sBj? 
z6k%`s7cq5u14E!I>+4zc2l-#Wxd=GX)!EWTL%uYt7!9THw<%pE>c=oPt^I4|oNpr5 z75yooogC?=+b3nWg43-WGN*G1M`{lON&r-nq(Kwb;y3y+v*fMZT5wAOdMTdOZZ z0^E+yr#W(kGAow zpHS-+h9jIa*{tm@D)E>EL^Q(ExRH#c(ZieV;{79nQT0k!8GJ?HRu=gN!$&)*`;Cg( zFMfy+b9qsBtT~d0ohT)L8hksM)mQ~qFXH91Qc}!+P3S>q+ArUgL6>a!J)lh1m4o9X zcR{u>zRb*!vF2NtUw;4BjA+Ag6S3T+W97qdgyNp+H~GsiFQwJEt;S;1jVjGj&gu`M zWakHCDEf$-<ZGw|jrKK17O z;^Ocy^Cg&#$$#EE2nMS6Odw{astBbhl)yHP*IJH`%-!+6Phl}04TI;R(T1~|QDB!p z@bk^qTsOxHVJ~biK(T4VdcKr0PvkL1CsV9LU+IzuD)<{M0Pm;X9!181;@#RFqq$}y zu~Q~6O=J;4G}6;%QtgbF`0qVovt1)qq}R?pd%_=C`mrKQ_FMSN4^Q&b=F{Jsk#e$4 zzB(0V=d0xNJ!1`F5I4f_jK)4L7q>u3uU+GIl@fB&&LvpI&qFz9>p8-|F5|xc*w9X5 z`x-J(zuFZO!Zx5<1SLcM`&~XY*-?I4wf(a2gR$7=?0xMlh3BA-$Ty_+U%;YeWX9!{ zF`Q}hAt=*s*V&#rkL22s7*I(`rna2=B$4)qVvdSNGx|dq_sjPPKl^UkVbQmpfnHfX zj6x4$*08q17~QAjjtDS3F6C-yqfUzBeK#-i<=jAsqIuEn&!$Dr8+}6vG2vRLRS<$} zV3}R?+^~cF>!vXT=d$>gz)2Wq{91twD*`R!sM+>FNH^{V!KH-DUN48=Rr(n`$F3^rk(9z%}Q^rT0eYTpYghd;102#^$NB}>?;C+Y=UB5Pn&wRzOq0u~a zeT)Y528J#E8?h`e(u6$)E_Pm*{k%Z^c<3jD>{$0EZ=*fzL1WUa#f!Dw1*MP z$Vx*E6!Nv&(Zhw9P;y1s-hC?LGJ!@5j}x^iA5YSSbfa19L*IQp^1+9G!yMIe?fKz; zVW58+J(NHnLd#?ng4MUECO26ksaU4ffK7FK9Qo~!P@w__m0%SzY5kGkkwB*SDVBDw z#i;8Qc$bxsPag%V; zs`NZi_~VVM&1b6w6gNF2oAnC8&0mA593>H~VsttE4g2heiO_6U+9)4`y)fBGwRtO~ zZgiRn;%I(taRcLWHSs#MzWfdfFZogFXzGQH_VE;5`%?J$?{N>=;#>1Os{_6Ay zhEt`;G+}l3ZLi@u=7!?dd)iL`8cpg6%HebTI+NAt2Vz!gQgYfwSxL1F6t*S>!oC8U z+M6$cP`XLeZFtJC-MyC-K)a}1M7D1m$Q6wa`jk`nF6E?Z0JjmuZT@59CsefBX+!5~ z?~biQvmXA^ba3U$Ihz1-edwQg{?5OQDdb=p+NrwvutcD%PSZ*1{I%#raf2c}V#jc= zi6q>ZV>(6fIR1byz?k*-J#(2>2TJ^B)(?Mtwl7S6*O?MI?D!~bdr7sJX_QF;@n4z= zgr3)QjEQGHz1g0D&Hq1I0HmQGN!WH6P(hd9uk(hK;7sHQx|huZe!IBLwrv+F=nQ{6{+XZMZS3W(Q9ER!4WfT(WZU?_sxGt7^X$4jMkvs=Ziq z^Xd#PZ`=Y!k{I{lhV|*#^XT+<${~b&0Iri*UBSLH=Zn- z{DZHdh7!L!-UumgjB4>+PYUUnc})(rv3c}6h+L-AYF-tLK3cpG`DOKjQZALWo+(ku z;60`2MvU<{{wL%Eb=Le24dXXP!5CdX&*`c>4*%5Y-_s7%-Y@ z=_hQB3JcU0*vKu+^-iYYcf1jfe!O6%bieC%6B2ej;Z*fH^!{D#TYc~SSn!Sbvfpat z;WNF9b)@7?**+^Q0dVhdD0?`2!`8#Ho7zX 
z3|^8dY_s*K-VxAjb8FCQ&S9d8*e(~8KU;|P*_-r5$N*=e@-Pz)ddose)^t7FYXr%L zZ&8pvYTRySV>#>(nERnuV%GCS3C7QUyOMAuT1G{<06cqcBtu}Z*yGimz#9*UU}f>Qb5Q^ll5{(g7Mz!Zw%iqp3WlPc+{RW@9JLX zQ(jh170AVS;;j*d(cn~JQf}Gp`64I$9{f^{au|rxis7*m!sYVM`J0^M#kP#d0SV|2|;_uCZpKHc$FVg}m1+bIv;WQ20-ff}h>JV`Tw&XM162i^sKI z-iQt^5u1*xyGAmb-avNm;6$gXT%GwjFR}D;jdjcvZM(KF`2&|OSc zZ~_?5#0}mMDWL5&F4w3h>3_%9m$M*gFO&23#g|tPbQO%z~i@m3VT4y>zvr55s$ogFqI+iG53n5gtbqp!O4v%fR{ zV@oWW6A@eX2RpW3@XikX0hLD6K$1A|#a~`R{_Av}P?{E1t0B2!%>mz`{|Dd9?#M8z zl?mK8AFuSS_4w3+P}(EToi{Q@^@@E9Zc9GxcHcb$9PYXM`qQCa=)xyj@tWYMaLJGg z$!gCZ=;VSI$V_uqA#BHF%R`glm5K~M7cs)v+$Cz9Xb#mX|GV+Ip@O~}v?p@sa0QBc zg=2x7FFL`pY3zlHdJYZ*#$`Qs4tuF1klzfO7S*(4faPef28WtP|lc*3L0^yrC=A%VAjAw{?q@ zv*U)o1GjL9yL|EKcfT9ra8BqodIdoa8VX#_a&G##`)YtmDB^7 zwXJSv`28P6kq2D0cSI7`i&5+@chu;AeltsjdN^AIVOVxb&XPAzGi1>^c8og}{6VL`(A6}O^|FZ-|FBIVX|`MX$Sj6Zc)l<%I{YmA4(Y#uAtIQHag zZvI@+wKP};e_Lv#_8wr_oA<`P5Z1$9@}1AaM`sJRhn3rAon7 z=HXz$cdVHAT-77voWN$|F=}9=^n51&uf2P^*@=BNrYXi+Nv{ZFjirWndb)>- zPc5dteP~V7U!(_6*YgUOsOk9;28>uKZnJ2u4&$laL+#th>e;cw-bB49lX`iSRq?5E zUXJe8tx~OzHmjRqy6-Pl0|}AtTu3k?+$+%S;&ysN}{N;=r?q(AA?+b z&J>^4m$rp})Mc(5T3Ux)v3sy?H2R)S5WYN9sf$E)bB{p6X(IWMbgi^q?oq<3jt1=x zReaE&oa5p!-%-hP6IN?eGfz3ADCA=%4i3*uY@s`kMrua{X-?KOgY~!T1iAOe%vavq z9peD8!@K9$fg4-J21{FVT&)!IP3+H(yl`U@fe%*3F9VqTYuW; zDB0)k*H4U7<&#LP5@v6=P8!+S*~h}^dO_x7R*7LGwVB!|i=>ZpyX-%qoLj@^@5VNH zTIB<*V*|!TAMig`>UKN+((UR?m3y4c{X4NEerANCT+T8Vn!tGv)QG;MKCWv98}1~u z0E2tAB3E_%92i;r4UAwQ*G?H;z|xDeK!JwgV77W}htq1Vl`UoqORSBg`6r#Kw&!=F zU!FsDa>ff}TG<}$0AA)jsStW?#|nOx<0>fP<8{Kz6pZJ9BT4=cc$r<|4(fYiF4Z{p zTz2y&^y^pK=UR=Iq4!~@u|JQ6JP_$$m<)D5@~uS;iNyH@X|+MSd2~MXov(W0 zF@H&9cGZ=MEi>C^#YZpSBzm_Zm@!$W^gutPF#8SD%XzjK*qUi9&hWq(ey{^S@Frlv zNwz+rle7S6)M>}t=la}#K(JPV_I2)r-zy;Wcr9h!nSPtIPwI@!pv!oRxqV0+a zK-Js;F^t>^aA{Nx)@5wYV`-J_;cYr9+a4U`P%P3tf#LXln#e~7ST^Y>Z{`o?lh1jb%ymi-v&;{+dJmIs8Pt=zJ3L^^|+?>uJ(UT<$-u$3TQ9rSy_tj?~H$YK0I*BS0>%Nb3AeJSTU4# ztV|!HCs?7`jKE5!+I#C6UE1gA3%qmC=TnvbiBn{%up;!aN}g`nd%T_LmN~&N7Qv=| 
zubcDxR`FDhO8sI5oo3vuHaq&i$KlEZt@+n>B-34NCuO^Lqa zduzq9P;Rih|F&~FoREm456<%N71nZ(pRLe2NH+fV^nm|$rZn=pG@036Ak!UFa$uks z+qkITx5*OKVcTcbmIt6?2LHL1viRz4_3suN12>^lCXRwt5iZ{VqVw2>-t8B_pQu#3 z0VK-ec%ZG+ebD9Z+0q|O+WxIq_J1C+w2FJCg+TH$zM_%bio(SgITEj~!a1DW_oqYw z(wfW0=xAJmBdw|2peJTN?9}Bj$}X*c{7X3P1;AJWY?L}FImU{nxhePRWFC;nrY|u5 zni@yvQ7863d*o-MF;g4&uTl;~N~sHqoc!gLb4Xiw$|HFmGJ+`NF+$eKK}VvFN*LFs3IgzeE!8xDgR$^}aX&Qx|( zf*iFNG=~X&qjae{Lh{LMM4VR|GNhGbgHhLBLvh=s6ej=W8s%Jtd4uNE&%yY*G1pxI zP=e6av5-wyA&sq<6%?Uc-0^8? zv+K$GrA`?FP)JtEy_0m#10L~3XW+g3;`;m~o!Mj-Q^|}~X!m35Mx&l@cc}@JKEQ}C z3GN|?17_?5$z2f&L|IN~i$U_WUE8W2yMJPp4*zIJlbthGFZFrI=)(TXLJv5UrZdewvz5X&Lw(kD+^%I1AKj?lt>Ut)ku2 zvOq$qD!jb5^V=MulhAa?@98{Q*Tp4@LpWey+|NG{oM0gRXvyyRqrWb9*)67))4+tM zG(SzoZRae5Atua*R})YCo;%zHBbxpKTnxF^wOZ@-LII)&!MQS6Mn?(1Qn+o1MW}d) zjXKg}$ORmiX2ls%YS=d#k-E~##_!BK^&s>f5$OXwd#u#Zsq)sCmd1~uB3(Sp%0J=u zFzmLXFCSEF`bPC~!H|vr1`oVAJgOG`j#t_xKYf4tZn;h1P9~g1X484>!b4Q1(|`?} z5Zxi|BFeYv7rentwlk8ovsBUwvJy3QM&^Yuz|3W{KGRGAv1NrB?u1efg}qhwT(Ke_ zlE^lCNc5ryn{cSMT3L~&{WJcFAhjx^aRLsvxTe@j|Sz@ivx#Wx&cjkF#NWWq>EG$b4SsUU*M(< z#evxZ!F{0vvabBnIPMZ)^|$)y`4GPd=W~^9)tCBLRj_wfo8+oYYGT{Q7zdZx7w>;1 zRerRR1PJ^plPxW0O^#DSZUzm2@d2yL!FaEDDh?gd%v7PQbQX#!%DhvU zsR`;X4RHH53;6t9eMnxM-##0mQ(>#DiZ2SF8h8Pl2NtFUG2^}rR+X6mGS%C1FF#$8 zINz?9Y)244Nr5DbI;&owY;DpjnvXz?sOafJV|ub{xBJcYnYznSr>o~%5Tsapbp5UGzlz8qBWi@u&p;UnC zHQ&Gius30Czh2}H4b#u$10}TI_`eOqEtXrlH1+_mNO_$1xOd(LslWP!GEU@%Bd_hM z!k;PPj5`?f$G^~F8&yw0+@h*T==c~H)*t5?n4kdg5pRO#{hbx`o`+ef*6{pLXeRTi zXLgiKsnO!q`;2y-Pj$kd=V1V{2d=d72`ge}Ts2vW>;3$BdvFtbn}J{gWlcTD7K1{kUXXxWQ!O zxVJMe_0ZRBHOkpcY=8ND{5g8_5`YbbZ9gp4*~EvDX3icnw&tsDggXDbuh&0GKA{E4 z@%4Bdxc;9!S`No?X}?>cyJEUX%|(If_m7rQSNkdkN}0D;)FgScUIvBj3Z3vv+-71`xMM^qmnkw=Oq$e~Of#w1d^5or zyDD*Eygxn_m(O)BLKgd(#PfBvi52?jqQoW%# zZbk1-t=+XhO13>8>ctw+?w=-!n9P#u0cq~O*6AB*9%eWA?&a0$1+PG%<^K>*j{|Y$ zf;0F={xp|4Xuo;KA$)EeOcs)a1b26k$h@8$CVf%1V1GQj%liIqqJi!2!YyYE!iQX2 zZ8R!%s=GTKbG>*Y$RiOTNMIOkO!4cHU(1;|FrHBn4SFG3)%*3#WB>&N77fp*s&X25 
zlsMJw2f)?wLSK|ha$MCQuJmIIjvbefyrAS!oQ?6sqV`<<{yl(4|Lkpv(;)-5CaUlf9=5oD(uK|%wzQwaJ zhU_m~de+(qL$#_cxWW@{LZxa$HlyDPvy-N9deh8WFZ+KY+XCXEe~?C-pV@GL+LB5? z4<#_v>C_J9)t0sJT3RW!Uy182Jo;zq2BvZkM-#_Q+A_G8NNE{ae_A*OLOvS zG!mzf*iJTlMRjm6bA&UPJ?IudC3;tc8Sf-Kv;$WA;t6msHben0y}|e10wvThSC%k_ zrRjA)T8c{3A^l3tor3 z)-?OZvogsQfRwzZ|D%U`7H^U20h!0>*s?RHo&dJ|`2fGvUvwG~Z*9A=!ln33CExhuTGb{<&Gse@bhP zhMIR*FtNF{8^DlLna^REOAPTTMg61&@Cg~;DV5vjPl$YL?&yDx9kTx~KM_KFQuGJG zj-bmO2Fv{^gSd+V1|nyZB!vH}+zA8D(6K^Ut;eDW=S%Cd z;(%EMTEG*m99-4I#T^@1K}u({{*EsldQZPi>l47Ho|V%wo}$9dVq>S(2B)=|;KFi6 zRHonA;Od+X_Qw+-xuO11OFhHuh|UiglK?u?ya!j6KT{-sS%f;EB52^LLc46~N(As- zJe_w-l7Mv~`{^(A{$^<0<65|yH*+Edv}|v2R2zPt9k#!-qrkzr(fYx5%HkH7;9cF7 z=92AaOD6xdWFc|t*-9~=dA%nNb{Zh`)0uGq8Wqbu=ZITBM83ZRQb2W!?I@w6%^zJW zESiBv)6cY3*2*hs1}}i{)jHM%W_r0)=wrPO(C7fjoB}hAPlwk5$mq&lUIVs1nfX?# z#RfET2KMH&;o?x&zInu}w-Uit3)~)zsj<2e{2z%VxHt9(a9$JEFMS^D{f!cEUO{-g zXK_bTYM<#Csrq;H=UQPBO}DG9k2gO8bq4hb0x;rGMUfKOK1DMD&r}W)bqQ<@H^%rc z{-m@RRc}mYNg;DH<}rdIR~FBB@DzFv(Eie?-%mH&Kl8-yrk3q3JAz8oJqX$b`G8Y~ zSYF#A^!;6ueta?l>@Z*r&0qyx?f}11@N8|+n#7~l_GJC|%d5R<6pRDa&{vR|l1FNt z7B7-sq=pZVl)OxsD-ThqU$^a zRrsNd#+nO#3}Rs%%URprgMK5UaO30EOEd_*{FmusO0`Fyhi_wdyHoUi-jg4NHE}pX ziqvTFe5SLAx;esu&LWvsPi$I)jh#<;aJ3sxKQwC`@*Ru$Cm*hNjWvUldFdOD=R<0D z*H^X7KDx=^t8?9Y~YiM$Y$M!=a;z4WqraTKdI0H!^fOn}r4$Sc%iwC-@Lt z7U;dfl5RrT1bOiXk#MMyC#~g<6D;zVKr}i-#unx{R6V{Pl}>6qE}(l(S}XA`0+3~Y z$>>Y<78=4)%5WiEdX=vCsU6sdyeC~1QT9!4=eSSpH@de)T)UvY`RKDW= z$(CEaF4yO}wT-baCy&LE!(z*_umv`Nt@kYp(Oi-RI7o=(7NZUIK8gSTk^+9=8O$D4 z4V7sn5`n8hdPKaIE2l~6<>>n#wmSN5r4enO7sy-?hSpV2hBq>Y2e^>Tn|ktlib&L& z2*Ak`=)5hG5M%HnLHXkMyipb+Tu0tH3jdS*4|%t(*O6UhIafhreKS?`u~xXn#mo*3 zS~62L%=!)^sWZ35gsd-JY#yDO}0b5wI>sgj!60h(!@;}Pp zEIgnW!O80qfNYW%iy;1ww781~h%IVlah?w$t-k*eG-Li1nuPyl4uL~IU@QZn>+t_8 z?o7j>e%n4ivhVvk#yY6%D)cuY%a}=)N|=&$B1zVaEyixLjgc)vk+MW4M2N_~4T@wp z)?}CL?w|gT?&p49Jj-$1$MMXIc{7gVnz??b^E%J(_4!^GLu^ICe9{d+TC8-^=wsmD zsD=(kBSMc2QG)+G0dXoj%4THXxNM3V0nKOLqyEfwDoJ;%w_jjD(QA1Q+*mlsi*gI3 
zscgyVc(H#UWJuvnSBK2}<%n6N@B}!&^&x(cGljqXMbnDnAGD-y+9()EnbMo84(Cek zJ*vT+rGRy2&o|01p{0<-jrmHlkecnp6CbS%|wNn)B4Ow z$pAf#FukK1w*MF}qt8Y_zou2_m|fHWrN1*Fcgn9~%lbiU6j`z`CTnGE1LzWHj!r}k zE5L*1hf1Hv*7}bwQ(%deSbyqqBTypf@(B_{ofOYh*g6A}$7tuLq&H3L$QE2;(gj%tD!Hc?a(Rq6Gi1GLj_PHpV-nbN5d8ac|r8wyg_}{FfUHfN;ikmh1H)g^BkT_ z`q(+rjH|~bf0ygNudeP7u(o9voE;Ehi;`E+b8-6%iyx<@;>@h0^$xBH4PW!~?BTCF zGucW+Gxqr7pi!H3F%d3#WO|>MYta$mpuLzxO*=faAlD&5o4+)1jG~&sI+eF|k}sP! z#oc@i9rsje%8;NXZ{i3_gOLMDn#G;)h3{Zq-7%av?pgef`qR@a<2SpjjXQ!3dS;UR z?gcx7O8GZ~7AJ=%ScS9s(<$d}tIS9QD#Ml9ya;>!40hUloh6Cmw~s;}sBUB^3>E31_bB1vVqev$Z)>kN4_k^y>UpQW=mSvcl4F|;6$=eyE{U9AbZ!3iwA9G|h z0*SDX>)g9M+O&iZ4D90UB}4yed<$Bs*tfc%<{A#+TT`1^#b1%EvD=7@ALgQ}YrF#i zaj^P!R5M~;Fq!49;0xkvBKa_6sMnbj8xi|iFuR`db*7Ja|JB1m8Y}-u2_z3@$*EeP zVKY9bC!sMw9z;4(7bkS0#h3*tij%6beB3$IHcxFWoTt*?uInZPBA~(p!q6zCuvZ^+ z2-m;N%BTpdfiS(jbY5QDp3$+6}ZDlRMe{OBA}N>I_M{)XV$F zW3T2IW)+DJw;aq98_n=>B}(fGJaoY@Zgz{Cm3~q(Xn)dgG_3p9T4_**>|5nVWECwK6Bq%B!jVz3=huGM$ zwoXE{hmn2GUAV79YC6TNQlbRe_kHKLmGTnp4qi!}{Rk1$M#BSmTbb!Y;+&wv(oG$!qXxWjC6>ar4G zliQKd3|!kmu|1)!i}-{?M>4V;I&l$5dqM}8m8z2b%{Hjtv@=NVJJ@9PfP{+Z>PTJFJok3tmi;#1Od~;RFRQaZI61;NtIWO)3evf&z6!$3tT=EEH@2iYrBJZ{@ zLvd67S}yneq0)GR66XU1&n#tN`BWUb#AtPp@7k$QwCZ|u_6aIOj>n;YC`>J1|wbE9avffK^@&$X02)Ho<4Ls!*a34 z2GmEME(D~ILy`um4=>U(@^Y{)CDkx;{!H-Edc$;!Afkjy9i{bUL@JN{s+Wk5XrXU? 
z6ngvXO+Da9Lzhu8&OS0&`tprmKZgaB+Z5-(a!{n0jh$*7ITE)Fy}NNKevf2muG-J1 zCB^?dM!?dT;xmge}sr*28J#?z|Xm^&Eyy@3urS9 zk0h+Py}Re#%Xe3to-Ed2ATp<|+VeYaGs z!IVG(>n#gK#eeG7Z4*}C;GhDmgg^MaZhPay_}IH&3oVOGllne}g)Q1;2r@pf8Lu~% zxqt)GQL2`Sm6A4*k0H*b5HCY55CSOQkQv_MQeh5A2wL%1yrm`Cj_-a_n~%ESGv*5m zt0}!zojShi?J;Z(UmtfHq_XGFZcQD6!BF3ZwiDy8xqX`P%2_2SHo0iIkwDpb!e_ zS_N8el*Zwzpjzn_sdSguW>|%VY}ljGis{;epKy%)V>ZtfTLra+nHpRH240DC6Q}6$ zC6Y1jBi}5lFcLU>Pd}IKOL7NZ=9`7T-HRJ|MFgR-q+aKKtl#~VIUC!#n9w#66x8DK zjoGHqjBCCncgTBU$!QedXq;2PC(FZha8^>ij2)$*0}UK&jRQPb&EwqsIs1cWZq;<_ z?5C1-Vw7E)auHJ%gITocUYFa2J`5d3b#03k zXu?8m6bOw|9y5$3d{&!o4sG>lAH$sV^oQM96X>q$0su>~O^FqymK^uZOwiS8LVi!n_#^Mrkf&&^Urj;j}|4*-+sm$BWg&HnQ`7}Xf`f%sUT&>^rXqmq6hD@%uq43Dn%quzXX=D@UD)i zVG4(hP-@jyU*2%8$^wb)65n!GOd=F{IbFs5EffESsUmB;8dAf2Xq*18^)t~q9vr6( zn>NA~(I5R!%iVyjeSEjIz}f)4tTi+6@Wb}QpcVRlQ}Tj-IgXw*`rz$|Mf6ugb-g%j zkG(Gyj*(XmvK{{Mn{4e+oMu)DM;G5&-+DFy=TzM42Af};bia{N`Eo6-R*-UoKb%80 z_FDLldFJea)SRw-1GnL~Z&;Q&=!63onb3BdPUg_?Xe_=E|BPYzq9|%&Ldu2l*!OoI zs92K>1bz!NV(??@p(`C=8@P8h^hhJ%QT$^O)Mf)}`UJ6~y{ghk_Wy7xQV9lQfxqMW zjoj+~=^4h^s4R4e8FJ>#v@AkRtCae#zee^H7W&*X=d#j?_cQlk%kq;o8ig!qn{-Mf;6ZgR6eT zgO#4Xp4ZEF01A#r$nXI4U3otCwpE-?i5QFvg-F`J!Hqi}4I4;hp-udT;SUBb3WZ?C z9`GJideU3PlD8ucko`b|e&wVFYd6io{x2&S{{6`vPv`JlX78^>`w|?`g`X;84f^ux z5tuX+Gswv+MW-@a6RCxNjt32E5q}KiBe;$y(ZY~eWKsFZ1a--#gJf8-zd3XzcdhDg z5%KY?;;03aQU5!S@NDY?3)R2_G5gM&u{5&2HotP`Tav;Yz7ARez*>>zKJUA+0C2#; zw91i){6GLhxr?giEB*vL-da=L=g0lrxI3W?y$v^c)dQcl-QQh4m*{cM@iOb@S63Nm z11ki{{wvsBM1fQp2-J1Nga1n0lU-zWT?)s^Rp8-g<9_A@x;5Z#GRHA~V)ZgQ8Xt77eJ(pMr~NLTo+- z2(MdOW?_|B=Kt$S<3J^yvDG?f0`kw`|vWt zbi+iNa6C$oH@mC9VLz~65HE9^or!-%o#?0aunNFC*Oi4Q=DMR+Ng=IVJ=T>AU21Og zYqxcY)gHw3tX%}Y@Pz!p#oDFLD>=qOUtV-Kb?S5<*sTo3F@~bgUu^BA`P1Yw9%;zP zKAtRO!H2E&f!iw8SCZv!H8spz!fTggNfiMv(zxzUp(TJqx7ZOQsNu#Cvit5M1h=I5 z&!<8gNA~F=zt{jpG`g+`1gn1N20P!nx~lU z-zH=rK%W91^x}%lM*f>V|KFWEfE<%Uccfp+{_}f8hybjhq%I&i(BEMfKQ=+DQXTcC zr+(X7{zaNd5ks*!zD-MGXBZtg>#4w1De&o1;_)-k{2^v-zZx@#oGPKI^d{OGkbABE 
zo~9sky`z*Ue)RT#(>nfN=b$;ha4arEv#mpd8%7Stz@NUeWTnq{V5|+39oigU8Fj(0 zt?J#c_E_4ZhWZpLoUXkh=w`K1lUGy*F3$##ObuN0sgOxx%(yQB< zX-|wxSKi(^v~;fRR1yWQBA>c&98H>Rk+c-Rur}F~q{2?U{5<)Z6kdSprbD_NAo>SA zX5%+j#ZH>-=yn{qTUSF1`gVVkvagZxQ|1p_{!YRG^1OPfQnF>;(-f-jHTEF-(bRQl zkCozo9mkcj_4JB6q!8G@n(up2FHg%lHkrEive6i6dT1myK6D#wJUP90GG*Cuy44+E z9o1}Dj;LQw+GMe9?OzdRvyMui1Q_%cT$mW90|lCm(sho6-#7Pl zq58ff1>9DEnagvo<)3bR9x6dyqWTUxSj@D6woGp#xbkRYs3lWHs$Uio7n6puS=g*$ z2vu9ICMQ|EwuUbB=n2)AkOPAbM~HVhuDFV4|?jM7|;vzTvpqG8zpx@ zV$UtSd*+)+Xh~6L2D%b*`{ zOP4aNri&KG)b$WaJnQ&N$Jw4@fld{Wwt=13k)3F*D$_FK0$=_w;vB`N%f1x|<~uvp zR_7)A1@3U8GG9k_@b~2xv+2d|f2Wy9Ip=cz?ztbq<+4uCufP4S@=UPt7?%4KFnbQCeQEV$t7qdH3|Dr3dHP!JcNEz>1&jw&` zzdm*rZS}Np9_QWc&(hRA^ZPqRVyInKL9M4vBX905vgWrlfxDu1w{kr``s?AKADl|S zc_zoT8QmkS9m159wBv)d8U$_`9&1@a-%Vt5d8`c4xR}f2c>PW%{qG{jFsUalJ8%!D zpDZtKkk3`ISYCJftmpu}0up?j+JmS_}|ox|AyTEKgrVnlB#nNN!s6j%-Dwp|2oGEVbU$4DY^RM_(!o-(d`Kf X1qX|M$r%y>0WYMkp-z#Ob?`p`oG@`z literal 41530 zcmdqJbx>T}*DVSJ5?q42OMu|+(70Qmkw6-E2pWQh;7)LNNpN=v-4F=w!3pk8aJidv z&hLEpR^9skcvbJdx{so$qJiCe&o$?oV~#NvVd|=K7^ozuaBy%K3i8sLaBv7YaB%Sc z$j^aSoIi{@z`^;$DM(9de=yq5L`q=kt*@US9P}O<1U}pcqXjvC3}SP0n+`^&nzUJj zlZOaalW!7w-*rCxsdwcm)LPWE4RQe0|7D+0xYZ@p z-mT+dRL9EW6jaNvAr$C?X>|`n6ZG%Xy5!<*=H9+pk^M6Y5}^JX>^1G>3a`sBWx2R3 ziYSQOX52bu7H(wZ6A)6B2GJrZ8$>DE?P)AThdvI{lJ=jsy|9X?> z6bur6zeSl-@}O#8zCHTs3Mk}z1S2sUmP1q9LST^!Z$j%y<|r^NDxHQ3XxANOTP5$bnxV6RU8Kgu%b1_F_1+U(hLT#BdgpglsBYQVP{pdgtgX5J~sAGB~(dX3pp&aOg0TpY&n;8s2tNZ`)@= zJsJk*QwqID$O5le@^(7D5zu>$*@v407FF&U{I z@t43W@SoI_F9T$p%T7RuJq+yzmtpT zy7;xY`v9`EC++>x5K-@knb2dN+%9)OFn2BXcflxI*{^Ocbz!svZm#N4aYK zj%B+~QZGQI=Ktk+({*!Ns8Vf#ueysVDP!xU2`@i~y=J{~$eC;}b3M*gS1446lC z67b4oTy$92_ZNoZ{FA2MFCQ(&>G8VX1))ABbVI7yOz9o|ko{G2feo?zFYpGE^)`Mh zd|Gb%BuvmM?9%70==2dg=>s)IY4aA!s4=30e-D>CC2kJi@>x8cZwF@vf3v6NA~@Un znLB%*j~&vrJgPdjK7oBQ9J|D)p=|SxvJkiDDe)+FyYT46c>o#-KM?`~(Ms#%Qg!*h%;&&Ml}P2BiFbUwci1X(?GA4aG3%h6MiLAXOc*bu+sSz%l`Z^9d2XM?!z3e5{-_kzrbduH<4)*`PfUr2N|^YQt)$; 
z*$h$XEn6t+fh%u$J=72j%l-NHGtm!9cRh8mfvw}U;X?PF+5na`-*=(REH9m&d|@)t^gUo0wsoB@Q%G{-Gj*Umf(-1U99_-mI`ea!4>0 zzIl1fwa(qC0||1W>e;P0*xu%EKF@fkx3KmpD8(~}bye#3axywbgjdzn_&xrEz}F6z zGZnpNih$-;kq93#5JxeAN5i)oQN^EItzT?NoW+9|6_jvK|4w3`7#euye}KW;PlzRd zg9MImv2gf*w^3RAGVn6{*Wr`DtN-sc`2XrF5$mA9z3ZdFb$s|L7*ejB3{~b<1{z_8 zS12gs!)h2UKj&sayedg^7-w;c_i3L>q~MgcmaK!xprYrE9L@Rb@yGi=a(Mz4@~Ia= zZ76wHCbWsN8zrvHa)nxg%$faht~MoOK^;;Fxp^}q1+?kg?J~orHrZj)iw4lHH&`a$QV&j@yiEUoObiw^< zq9c}H63^JKV7%n%0{^_-7gY@2mQmeME^WNq)C6nezI3W^c0Nt(zb^34+bI+Wlf2z# zktP-*xQ=l2e%-$I&4otn<7mw6k0lvyk7Gz?fkvOP@14|5{R@ZCg|n-6ehQ^Zk;}lo%w?aEdgJJY zh^zHQ-9krEH_LU*=8?u;@h|s}7ZV)xh!={o3iR9_NE@kHo_QgVcw3U48 zO6e6Wd^bI%tIzk=Zzypu!}G^Zt-T~g(@q}KaA5x)n+W~=IOJCP@Q1jspLfgP z%Luat&pnm#SH&qJwj$n5)rzOHgr^LltI5>l{^k67{FxpVg5x6xb>>17=o7^+4x2%E z!!Ac}8&n-TPxt$LX6Xr47_;qa%JMFJJ=bTo)OgK>96Fl!GR(e&)m}=t20>nJv5R_Z zxyICxIgkWNgUefgDDGI0mJwfjjyqM>@1Camqb}l5cvlu^>2-CUo+=fF9M&qWV-{yx z?4F0aM2|c?UaBoCCF|Sep5Ho(5d^Ues7{1gYGo0;b$W0uF6UR>%J50rzkj@g{mQSR zeuWmZ_}=p~sddx$HWzd2psMrtr%el*5rxJgPBDFpSN*z<3cpGd3LmehLgI{t_vtdR zSt26S*=en%DJWcyjWh&C78{pE8rN#p^Uy4;&lC5CWA09yc5ZA7CX_|rz&zwqGwj9{ zxKAurqG$XX&%d3mVroBKCUR1%w4%!OB5P^nY*CRh<#hAxGHGDAZ`>-#oFHa({iMFv zVPGk9E7<;^jC&+yaqG}_xHLp2!J+Z``IzV&^$049Yi=jD`8K!!Mg8OFTw%)XhV=66 zVX?iz!q-Uc3>d?Nm@}fz{1k4ZS+l*4I^)Hi&MG>R59jU0`BkQI>$U9bD%d^jAm$B0 zi*!NUTSHn3>>F^oH<&i5DTpwIpBYH3%fvEV*L`ZQ;?nXaWjywngii;JR%K0(=bwL# zzinLP@rpU>c(7W$p9*&Zo5a1>fN3VdRA|>cjBagO#+{WcZo`sra=%p*Pc$Y=?3H(`-hwT_FuA>uavUz zC%2@ssryH(`)J@%cZk;rF|kHT3%HgX8|^1bJH1UTd{EZfuGh)+3PDJYuwrDCyDACK z2UVU~k=W3k zJ$9kOE{qkI9lc&zNH5J#CEXwC)Zxa#q!sNpNsqatqSyBx%WSB&3y=YfT5854-)o&zDPIyT~InjMg-RfrYv{!^#>G5`S?}Z+f9>j>t6q z`_{3I!+F`PeX6!@^;W8Co_{Rz0C}|Ye!YCqzDY|*NP$uQhJ&8Tb>6;pXq>!pjtSLP z8r#UL>t~!WO=eRlYv3cW$uXm@>B;(Q#%!Q0xz8vFX$0dWxZhJ|w`bprv096j#W`-| zO~gbmh(9)y*I>SIV?R1C*VW(h%7P0*es|b~hvSkj)|XDAk(610OLtVAH}T1uUP57_ z*&>^FmyDK%G0iHk4ELwwvd1s`yrwJ}1Uxa8d 
zZ88ZB1={*$E7uab#vK=3=T1tX1EVkGk=586+{dvCbnKU5NAAx|Ca=A7V&K#3Fz^sXeJX%`_a#DO)aCouN(|4=saCbXCo8sGIy$d=DwOP&` zG>wDnVIDj}&30MqDK-b)iP`D7e^GtB$mi$Ie+JF?3SW-MWdH03=87>H};?-4VlXnHez!2 zec?w#W|p#W{SoXX4z7oxiRY_)Mh?lM!l2UO zXSi`u33&$5p3EUhM;S#^73Io=v{;iaU^gW|C>zj2(q*%N8bFJ=;%mFr>!nO`gYKsd zFH4~fPB`nsBP3k1=Mm8;nd(vtO;>Q>I0s%Mp>$z&Q?2>JMH3nIVy zisE=;wu;=2qM`5X=mVidNBj#i6+r&ea$1zL(n=6hpX|uX#Y8u}eoj{|U#mlFJ+$dR z%r$}DjE18iUNrOJlB>*TKq{LC*o)+GbIwyn?#6RdiRu)eDSZwyegABQ=mL|4n0c$+ zeTn&gKYxqcPn<9q3Edoti9m(n>{c=ImWIny_v3;0cb+!BI!8MOiRxe{N=Iu`0aXMZ zLLi}~c|OATAAgL4lL4E2Jvkh_&SxEk4?yo4w@P)Js$?p(`@N*>KQ5S;hqVX}>~{R4 zrO@xxWW5e-12n7(F{2o93dN2iO^;>F3kG-fEKvplR6NgH+y?GRIhioHJN&(=0AH)DnDY3d~L3~3rtf+faG;BozytEdIr%? zyx?X7VN0?7FoND6vYdI)X`+J@3GgKpY#BI1Vhe*&#};B6{2|zwqMXl8*G^ zhVNGw&deB%%C%i>BoP}}bDHRTD){JPV}I#2wN%UrzoEvbL?wkHn z?o2t==a2y>x^Wa+MmLn{MIDyY>gk)rpURNmq9y#$I%tb!J~R4Ih_yLd3s9A(#mY@I z0Nf1{F+0kvm2p@ir?J6S&Lz#n@5e=JyYuJ`AcBHJhO9QqtdPZGlyRuoma8sVfL$&A zAnTnyYQArR+ryqK|`z&5}uD#78N@4!z;!R z+uCf8c(X~ z85h7GqJ^Q#m{YVsz7D2ejVOLxD%FF%p_53J>$!tCs^q+fEIF4_?}=dYM~MLWGTt45 zg7}4S5_wASTVKWxFhsdtIJva(TOUJ{4ynXS-js{zAt74;3K%KdyS~=L+zHdaPMV88~mccX8aONw7muD{2 z1I|r5((QO8y!t5WeW40XqGS`#t+u@NLDTG2NJ$?2o50H#pPFuwT4HgcS5)_YXVteD zZiD`5f?or#Byg<2Sk^h#hz&DV2RBZi?Bij4uMKOV2`leEHW_mBCwQg96*^mBCh^i} z+ksn@{g*yor>1U95ukD=1m=9_>OaHb++X=A?R@tYy@bl4qwdCA?7;%I1D3T49GsI8 zVa{0rcizOU2eGUTJ;6#HYg(ou#hyuw?$H+`%cMa;cB_N6gHZch9{p70FiLWgU`Ts* z+^LkQ73?REJ31{_LJyfDi?A$TezEfPZNkdm*-QU6)zV_s-!U=>i^oWeHNyT__&#OT zJ{;hT!4uemVs^ytX{sWG5a|~@8O-loIz{!~N6A;^8V4VKWgHefIc8$@1-nNG$z$X6 zwn>V=q4pNLmTI<4`PIw*6#bTD%z&wO@H7)U`X^uo;^@k}_+pZVDcpRP@DT1&7-~n@ zdRW@$adMQga=8UEtl6vLT~YNckSX>)G+jlL>-TLreN2lZM`H4ji3kQ5Cwo#YO6b8J z)yQvO-^mJAPaXAaG@oE5P%jP8>I+`Ngt|Ao3~M(K+eJuGWxyOfR@1bk3~SHKF|p@= za2O9DGSNJP?~8qgftu(4qo@1%C4`Ona^ryq{T!~`Lop}@{QH3ibs^-cA=Ll+(;dOW z8LK{zJbFBYw?kqTZlO85Tnx7;KR&bWA5aea zFy|d%fde35d0H`D{>xoXg65y`7i1V=)G3_LXF{jA>tE^kq9)?(F)eG1K`Y%6nJ(un zCjQV|)&R4S6ficFc_Lp*Z3r{XX%3g=m#H25R>}KIE%enoF=g#wy3&Mqtmu>_YUQfo 
z0HA`dU7dL|FFagf%kGcKFXZQCI?8RswxiUEoG{+=jO*_z{!&>mDSCN&`DeTQ^s-jY zeynA_Lwh-Gb8^|s@*|$Rc3t0k3Sb>q&jeYrTt+S)Z!?Dj^KFVa!(CUnbu30_Z>we{!ocnil(4)OM zOrRA~Ez6hQp!f+*Bv3A80{QjxcRDL`azdrWGTrku4lJh5p`wI(nw?4Yd^DdRCQ#4C z4pPaC@PCMu#?Ta#&^oK_Zw6)p<+Dz{EYTJulClUy46k3pFVZB9TfnZ%1?F}Tt@~2f zwuKuPsX&F`lY70D1LwL4l-zC)UAmMO$<6ns8pEWy%mZVpjQxJKT8%A|{jc*whJ;kh z|BF|u_EGd#75ImKx^wu#wMxkD77soHBSq4W-X?BX72}x6ec~b=iow$(9@0*wiEtsl z03HukY|fWj@s+tkQ?{I(Rb9Y=G``7)mj_=Td3D5_a^1&=hrN9KG_@t4Hw|i8G4qc9iT4-{OUkUsQfnRDWHHiRw96IE*C={(c+1>CRQ9YmKN!)D6M3 zgFC#RDacww0`Ze5Lsj_?CN-`cYLj=mnSJ@DM+|m+fZ9n!ShLt6r0Vpy3&k^)Wz}%qL9z1 z9$yp~hNHisy|w|z%oIo0!B#B}?UI&$7JbJ>_=iySPmu-6^c*zlrj^<2@AX3!J$Um(WUv>pBt> zxs+-Ln%Y91%8I%DYgFQ0=qN`y2iPgWVrGep21HDoNhcBHj$>{m zj$>;i(AGW5)ms6f7I_zQP`maBT}iskYG)*cS~L;wp{%^$G6eyWjbIlTk9C;e=pz+r z$L19uvjZX(G#YEFjDsxGic#*Jg2y+47E^ka=z}KgfX$^3ylV>#(UO zg)k=p%a;*$m@6zN6mm;2fjRy>2qK`9cHUJ2=n8caz+wmS3}*O{QE4Cg6VXM8Ja#2wyK-Nl(s=3MV2#gS>8? zsEIqI-z~ZK=*4BnDH5`1ZOhIry8hhJnDM2yKKeUI3PP{e$ukQ|hlQzM=75V{DXowh zmo2)6Z;PCCe5R5}rHn{f?~~~uWtZ52Gx<|%TXqFLx28I3T?zPvlQVc-&^G$1NbD40 z67BgePSFV!PgM;T5M5=~j$?#bkzEn^{>z}T?-gHRUuy*qKV-jcAU;Fji2;AEM%7YF zCABveP)ll_tuqKrgu@U1{1hwYBw54YC|-nwvkchT$J9$J3?hVtWiovk;{G#?g^pGS zt;!6XobFuC^)zaBu|f5wx9^8f^m>LL86sSmS0fE8=E(z!q1~t!ux(blG?xoK z)RFlh)}`N~M!xHteIxQ~>s}9FR3Xu$=HzvL{)FSr&{dE41QsEgJ0MpaoA)Dw>xw@I zR7t(V2#4G7GV+K@tcmw%8ph3G6@#8y{!%umvVQfMM&j{ijekPs$567^6U=eTg8LHy zxqZ#oCmVF-r;f;^^G=A{Tig)PVmFeF+8hdg8yEm|WlclCo+MsQ-FpMa8V7f-FZRkY zq`@S%1~{2@o@B!P^@t@LWYWZQ>}K*`Pz78;{)bf?!59wOql33Ae$4pH`J>snB913( z`OudBHflsB+&hq;ahS%>3DZqK+#FvbQivUF@ctb4jCag-R=lf({5z(@7&fXaQBz6XMV;P#Zq49R(~N87m7d z*tq0)N8U-42_XULqc`RBY(H)eGNZwX%8Lkyg;KI4PR}?v1zAG-NWQe!rc`xOZ6hiZ!FoR)}BO zy2=(s9U>6QRcG91$j&@DiUDs!z(5O|4pm?_Lu3N&!@zhGQ_@2V>SF^p)FNkBIzKS-vSVSiNrfqeD^PyZbG_ZakKF9CQZtS1{L6RuL_O z6dKs0=ln}I;8#`4z?+_6o&yhVyq0d7wcS6WE#vyYJpqhS`RO+j+rnwBTx zE}i$>Ej?q(j09gfPrKz~?j76_Ar8g%D>9b8mk0bK50-)9*MA1T)*sgqUXVv`awY;- zmo0LBhhd=+C(f1f%9u54-7k`)jS{*r9(H37C@S{Yh53?8rX@mzHy9eh7iiV`TAs?LWGIj~ 
zN%hFGTMD|s*qQjeyMMsTL<3_nHKsq{Ic0}&pQE#R8*$#%B2C1vvYKEc|KI?46BKk9 zRsftinEh^O6ivNj$ms?R)DO9m#Lk)I6w7RblVP0^dB4j^>5GlX*EI$@hzb;eS|e{5uJ;N(YS5}k2! z6E=y%m}v&;6rEJvlRI*VSBD63nL^ku?k!+i8Q=S{+pmtvf>ix(E=dmEtG4CWD)(dG zrgM)p6yU4PHVw3Nh{a!1wrh(@Ex0@KRITt(ur9l zFF`AY;2ztXiRe^-33Qz0i%emQufsw){O@{O)|+)S3WOi!{SYiW2vuhzp-xIgEw|I@5od_vUzkFivG;;a0LN zA6Y@i0_Wh~Qfh!e_} zYOs~V(R=q3InLvV&nr{a+l8-NS*~ank0BI^W8sZ?oj}@-v=_0yXgg8@- zSrpmpewY zq5is0zQWU!@pYH~z4U%ByBWkwT0}4qw;~*T-tN^I_Yb7$k!6nhkHQWJKztx@Ct&*3j5v1SuqyTP#zr}uDh5R4i6tRC$ zdf5vu`7Xu7-2|>lm>8A{+H4kt$MD8p#;%HZ#H2coYr02jBEx+rczl6iF$w zB#~DrY6sJW@T*@PIfPWf@ant?;JCoX_tx0=(5�WS79xjr{r_1B8A& zk4@P(M94lh^Ob4hllJ?u_stg1aaTIKYUj!LGmROP%*Q_wm2wU;cYxa#?l4ylwo^laO_iy!y99ZT2KyX@p$_ZnHk?csg6$}Fzo$Vk|{ z7xP|uIA1Yf`)X_qpr0La-wRc|iYL(~%8fbme|-d{*BwrO!A{FeQbIuD-UmGa%<;#o z{d+*b@5}Z(wrhcMMI^t>Z8Y|DzqmaO{0V00oJ~BVk$yjL`qiuAFX7X5AZl7Q*}$f{ z%#l%%}7)gn>E`BA7I1j`MHLMwG?|Vn*Ouyp8_P11Y=mleOl2 z26W-zBu6C<7)!={`T+tK@$r7*a=bQLoHf&RT%qQ+?qyEGqUUCY==~-OmiaFRi6@An zF{5KA=dYjT`#98_S7koyAVi&YId4Dz{7Ko)clx0;EW%ey6XFoLKbrb*xECY4k{)+X zZ)DVayCLT=ywuYQi|=Uiv*_9KTY9v`TEGsn?s1>6##SJ~c%v1-!H0u?SKSZD&6b&- zKgvw%ZG#`!#BY`K=p#9fqZnYJJkPI;0XcT3(a2MPl&kxwaoJZ;6qvvG(0Lzzm-p7n z3bggI7`M}lnmjI!Kf9O_vtH7E> zurDY*IW5k`#cy-E$h$t99|OLf42PBnk{}GfC&he3e4|e}@L?mwadNQ&aND@9x#`A0 zlQx_tP!1awEq0)2gXwO^O)-y2BGu1nCl5Z>Au`>qe)1}KGJN1Qu+FO#>sj>kkvdXM z2zd+03;wLla;%zW-l19BREiO@8evl9)})9S4}L{InY={KrSH08Eoc7txM{#{z5+eU zK?nX`(yUwr8JZ&dlvELj+b)-HlTqMoe?MH^pX|Jhh*Z*j3)!(VnxXbO%24J0qrIbq zwVZJMcwg*jc?r!9_DNUyH4u!DZ0OoG_&C2t`I8wdxc`87#Cs#nbv2B!!r0jLW^89t zOC^=UV|yuev;cnAdM}DwiNw(Jy09h~32pw#qESl$*1QIr8PU{r>!|#)l81M#4q6`!y=-zJS_mMxgjmjLuc2Mtr^^1?U>e&%Vbv+dGJR z--eY%a(hgMumg$!^O`d_rZy z)isxRUV%{wt7Gf9Jv>XL$$(8bzbi_x!j&(pVv_wyvex0h{vZQn^*D8-2O1KSbd?s? 
zjTeup@DAcn@lAcLXn@IYzJvJ4-*ELVhYZkm?hTiYge}O07f{mLFSQdZy02}{h4e(ub1AWYxmPn6{Ffnrc$2yfjkgV&p3RpxX2uYf&h6H=bM%)rze0f zbf*^ehu~!buWyM|{0YX;)MeS7{2rStr7xLb?&D#(u24UR;@7n}xZU)f%ORgT9}i3Q zTCSphTFX%;I8=|#>5+FkHvc*0&To==l6mUXDd&*m2y@uQTupTv?nTq8GhOor^)3B& z=n01JJLLJ7Sd=?q7mM`inqAuoSVuqkL5$0-F?2f;>pqn-Wr-S+L=&N?4`6!y-6z84 z&4p7ve)3NeFOoBk1?I$lOcv#)@g&HH@$lex$@Y8-9?QOG!2i5k{u1T5}~FL=!RL#Y2T z0dcxtq_7s(Z`r-FBsO5uPc5NBhuuR2Ay(~xl!*%G8(KM<$rZFQ{a2=G^>HC2tXM*; zq!t&H{b?Flqon5g-B=4CLMzi1NO&fnFm-zyp91(wFffEwV(sW7k_(q!Lj9x(8gG+^ znz|)eF6mao;~Iz>t%Tp7lL$RMk+UYyuCI}Q&FwQc10y|F6XBii?a|84Yt0LB^3QE; zkU7~aRpfr8%}kc|L#Wm`>~fwL^$75LJlXnYYp+$Sxz1E-w2-dj8uyS8h?L3>INW%7+GL*q53yGT0!%~yk%muF&SKS?7R2x1bEx{@in>O z6_G3u@TwI*uN`3!ThO2PN1P`)Ub6RQeX1P*F|)Y0n4BRj<%3#+0PBiE+9Xp8TNb!h+D zg5dy0NF|X@JMp1)3U!<|SWqd5@jIX1&H_9ds{~_v9_q&kcqZ0%mYkZiuB+)%+O^wZ zH*qZ!LKf7&jO>GgdrTAX-9Nvh5Iag@BmfbhjA}&-an!b$+?(f81*1sQX<75>v1e%w zZR69D6cBGjSkBkM_tWMz_4Oa zfvf!il3kSA^W~mCRmJ|nlNTValL~mVrZLC6m{Ei6!BpqdS{@d^EHbMEH5^6W%jxLN zs?re-WTbO+VHfkwyUfd(h)2Ymm4L>x^Cey*pBg54$I{10CoZ zWi|=K^RJ(+D0p{{A{%|2u1+nfu@x#DB<|fM4mK@@->qmHdZ=cmO|z>IcEh|pUU|od zXk6+zLkzdqjM63oA3jASxp$=1U9%I2?*WVxl?I|X1#L=1!LME^>(~%LO-aijO%v1R>yyD=~U&OVIEL9gcE zWfZ9=4=|`(Y($){!H8!m!XP58Mf>aJ;UTVUOnxgi$FU3&qC3d56j9W~!RDbtBXe(p z0pnqH8C=h9F#fL!pxso@w)MIrF)hcx<|D%dLfhd1)nZT;FzK`t$fTYh{pQepV?j%G z{OwG&uRnt_nxPzQm(<9d8I3t|b^6nIFt~KKF=>qAtoYfmh};7fAvLGBis%m>xQceT z!g%SoQ+AJaT_?*)rW({c8nfJrV{+Y{ z?Nnxh$wJZRE^wvw3TN5i4feCa30m~a(@RDQ-@%*4j#old&D35)YpQBwo@?#?JL>Z@mI3Stv!(uX3p@Ty4up5PuZ0Rx<3iO7O$ml6 z`A#v`4@L&8UQQ6qyd2oHAOG6uSyvX&XwA_S0!@nk(n-0>Wf&NW1C2yO26r~(0BG#r zA-q8iWfUdipEWpOus)tK&z)7jmyZ_(j1_+l51gLl)IPHAn(4eaD&dz`_BC_n{1ahX zAtk2z<=Yf&)iwlC*!VcM)M&P+tb;=W<2!*tDM1D0Wqam*C7|LBs+d76$A-8LM(auN z8(???X&=i}FDo=(AhpA3EI4r2g-1TGzPrLdIjAAr39SD*g%=-3gU~;MLx$O!scN_N zqmJxoiKXVzqRCuG&z&}N#`hcfC;qhVZ-rl!;*LuE%~K70xFjLmBhfxWdW(V!GOzZ3>AD*WZjzst#VPAH$Xa`>K&@i`xxsveXeV<6 zhNIzc3vINbvuH^Njcs0yvjeVqZO&lnC2w`rBoozg4@*+&`5G%8;Prt0n(rcGqfyr( 
zWIKDX8Z7H3xecot-5qI9$TIjs!Q8gk>Bt-t0a>@f*l9Oz7J~agk^eu%<}$5}N&NK8 zOZpI(Yxa98?Q)wJi?am><7Aau<|PY=ub6GFY$3h}ZQWz(r!b5jU&xZp>(VHGdFQWi z=F13w+6!x~h`gIYUmQN?t6{S2f5qS^pkiG&GvHc-8*umhqU^(F>^^Y%%Q0cXZf8KMlrt9;6G?KHC>cWrM2U4bhG(kR5-zxUZE?llOXzG_Vd);4BBKQ zGFkFZy`z^o0i$oFkvfy_7Cnr9^5wG4`FlxiREO*@R=sy2t|><)Rj*YU7I7ICV;wN7 zL}J3{OBcLdDf_Q`mE@D#3M35tLYQM09i2^{iacu(tkU#C#b2{Rpgm;E3qG7Th1LPY zFZ=Q;X?qd_P)O;5REB3)2hh}`#|vsre@&JLz6B(l=c@j}>pVi^{*OD|RXF00E@^nA z^QMx?aQK0TG|X4+bS5@i<@{fbYkOW;o}4D*vQWuUrHM1DInf|LJejH6PYYi{TvqSh z=lM$gVa=l2e3rgDqssf5@VJ|R3%>78;AHJAd!Aaji)NuI-4t&YF%?1#HMRdud_}!a zv?>+)-dK9Jmt8#Nenhb6fuAJRqG}Z3%q*RbmQ|pzJH)z272sjFEXn=A?q&H{XtST= zmcNhQ%M?>)-aGJpdsw?94Otig+)s8`GL)Uk>mD}&M*d6=jbT;mcxg}cgK*XqntEWo zxK`F_)5J8XrQ8cZFzu1*;qUQ)p0&emG|7lNS1oRmnvn}d*OB4Dhtq!mh@ioS>s*Rc zKq-xcVv5RHJ11;8%-i!-AtYFnhPv^EY^B{Sj~}4D*v~q6*6&;T?_w5czH=08wTLDo z&?qAb6(4rQEGHcdSxe)t;s@I%?xTBgsi10v)Y!)ptifL~zHQn`MMvsa)h%68e zL*(o;_FKCEl^%OiR+niGxf#yEXsRFCMqDP`YW~hG&qt2s?t|9v?J!?P!Gpxalvyx! z{b^`+?p?yeP^{IM-$Bo^kZGa|*F4>QYvM$Hv);A|er%^XT2$|k!09>Qo5jmACB(_| zL~sQrDC;V4`}whoUdsf8gbu#{D<^jKcOV{bA*ns2LP^NTW!zA>sBA+Ixov#ET!PMpJJnUY3$B>(lG(y-O+hxO=-O6iFfO8HYE^Pqf@^dJv zZ)IPcW!s1x`;xH-$bXeUUv#J41(O{#9ZGozQ@n&uL@REk`Ak3Gqs@!U6)B5ika*AS z$53DzRTs}=HG^!5CXJo5BElSeQv{rNxd|ga!0ggO;kvUSWi<8NXju%xh2i96aCQJE zW6ZZmZn-x>z!6o(UG8l3MyD}6>)`LbBg^@^>*Ra%ZGw4Ci|c?pFFo;XZG{)S;ud{I zfRr4hqj{^O_#g*aUG^CPK*yDF0}!>kKR%kzGUh1s+S^@w75G_nrlHw_x1P}Z10<$= z31sbu;w4yK@8!rJDpd7xG*g)-Pcf5>eB%%*K0%}A)i>izLC1fZnS<9$UBKCzMC_;-}v4O>sasjUytMPO-|Xh4QwV{#Ik0 z2u>wF00Xk}E`%6f-*G5$BArc8w8gPeaBjy;+PeaPoT)vymkHg&1mb;_q_m5G7iazOLypxUHG;^0GVpY$Urg9b+-%5Sh@AQh*G;<@O@o5~RMe`{(Is z3hWG95O8^ufCU1X9a!^Fj#PRmQ2mW+FNFK4!slOeos0d^)|w_2mrw1!@tvLRHHTEm zr}p%pMhK1(P(Xg5dKM*K-Fn0W%48^lQ07JY_3#HVVc9A_T1W~i3={n>rYGhVl{bvQ z^4qK8zHT9xjQji6pMd5W>Nbp6NSLE6^KSP`)NYlzK9Tz>%G1(G0qKnRcl+JWvJz^=-yK8Ftnqv?j$ieg~!gDwC!#3z{%@iuNNY1~59F7_Yc!C?3tR=_ThRg?cN?HIpAVr#gzv{o zKjsID#nQO}U>`gDptT2I)QSO-JZlB;E+@%zQ~nSL1s&f@kC)^Yy$s|GE%|*J&KeD| 
zN`W}L0EZvSq_)xoBhHVl>z;(nvgb0t^+3@d3Tw>cupa>xlgsf-u6IT1?N9E(Z2abx z(N`zL3}!P^3z4izCiN8_Az;V*X)ak3LuuM+#fsFI--YMQKC)7XOf3DJs0@(nen(xs zysoF?UsDPv#q79tZWQNHmi**@h-QFMe-NVEx1F^Uu8<@OxCDrgBb|QY33>0P*w0*u zM?W3#8mu-0X)gxKvcw|ec0l_azZc~`x2qQWQdv)`YoS1i0PfvOp)3?qNe16fAXBDl zIYWeo9ThVqIS5#*y{IgznK)xYsO{$cKQtjE{Gj~gj}(ril3VyviN+*A06m+qJYcJX z51-1FpN1-okgXf_X8_e*R9oY9b@@vN8ITRY_Bef_Zq18Y2hN769zgmg-O*F9!AjX& zPqz<(4|l7aa@Tp8Lusy~(w&G!=&X)p)v5j$&I73cI1gK+>xllS^8~7FHOGtK+G1Ti zyt&5V9i8|ZPY8UE4O%H$l+}4j+j1Ai1x^XY)laJu3j(c*p(7+vYmJ>Uf05I4yt=) z&a;ih)*+)JE%E;qupZIlXn*Hf3Z{*)g8o|t zt@D@JVUfK%8~hx_*=B}&yxl70VX;u|e{ksk9w=vH_XRHw@hK28qSx?F1@c@@$p4Mj z4LOVcJ{(-)YNPW1GV%tFD*vkl-o5%t;Hp8URA89=soziV!8kgdNb9d7$kVI_A%CAz z|G(_NzWjYG{r{g|du8#B^j2W_OS&t~_-qr{d0sPx&{}4ZVc>t)w9-#mZk)uT>XM@B z_qP31Pbth?mN#uS+tzat7MHYo(aZRrnEy~~MG7Rj2WBBg<<2}KwYJOD&UQ(=WD3KL zzw+qH|FYZ8NQpY<=FRq1>{k9r18)4{8nv?W?MMoG`{OP_NkF`Wlw)Y+k^YT_P&LxwOA5trBe(UJ+XxgYtN(+zw+@SP{oY4K6qH7g z4gnFQ8$n=D1f)Sg8bm+@B&0h82PCDtk#3}8lt!KTkHKhG}RZTyvF`P4k(wSEMu7Qf@Z#~});CZIDnmk+tM$II- zuCfpmErY^<6-~#fr6Oo&Gw)tRwLYUfu+4pP`x-})VZYK-K)&Nf7aS!ocFyC7BD5|o%KC+5V!~jm~}&O+moOCRxw02f9GA?f##jO zixe-pd_!c9b=EbN!*WlG>K=$TxxW;T@QZj9g zAW>R9)9B9p4lrxT?RJXtoG~zuVyt+#&A5}I%jvDl`(X@2_%&ub>ia9Z&0j?Efy<`u zLgXjZIvCsXMRS7iWZrJy+q*?rPPol}6Q#{LEN6Dl8pm^P>nEtKya5A6y*#^%2(=cl zgG~4p4}rYw>j*tuJI2v)mIAvtmiM2cU?#(#YL2SSFLtM74#ZSP&ybBZMxy-%n80Dx zff*yxyVo#Q#&Ia`j@BBHjQcU39bt87eZdOFI}_QVz*Q>#J{WUwn298PH89!Sk`W>T7?RwQT(szu-L)2I+G#0-XhO-TY>Si1n8eX9Qky z&wWjSr)gZ@PO%StOju$0>`bpA9>+nf@rubFt_iu|HPr$<6W}@;J*8VVJKGcYTVs&O zy$tOK=m_4tAomByEm&X|% z>A1*>(xK_Gf?B}M2V=z@ncN;KTpYkihq4mG6{E3YS@oWNqtDY$bCLl!>+4{XVkuLAkIYfSTt@0Vn+A4FO6gwu8Zm! 
zpqKGLdyEzs4HJHCUqdEkeVr%`n9Wk~-5{@4f~`cLt|idcZe@nhBe-Tav3F*<1wLiz z=MVc@?f<&vmRYj?lZN!dE28Xq;LHlLjKV8=5J%i1-oP}-!#qbmtD@SyV$B?OwoFCJ zk!FhCTW(U{@7;Je;J`Iv5>0ely`F318j19i_!;a1rY~1btqR7buRJCO72Xn{GQAoV z=mAX05ht4xZ6BR}6L41ft2^n6xB27g9%xszXDlO0(u{(^VQT!g$6aWJ}*0Rg4~`z9)_p@O*ZfYK_nR9HZ#@SBBD->(2!u#HpL>DUTKW z+QQcO?i9L}_JUs|3Qy;i6cXt__nwVcYcZU|ReA1RtX>a>DdZ=c3`Ol9ycWPD-o-nf zS-u>QR`qIye~!i0 z%I7FX6>}T@Dl|Q<*}M7L>ZuQ9vxV+6YEiDYhzjG^GywBksc2Bc1+_8ZA*P}0D6P~h zx#id6dsA6Sim3a`HD=fu6F(2Gsrxg%)rtw_fkL9!Eo;4lhj&7Djb2m7bv}s$zr0PQ zhVT_|*L&_Q7OvL9y~FoJ3r!Z2*O~(DfId@hF8}cP)u=UsV{T!F8}GQxU)*Z!gHi3A z;owY_dIuOXRcPNyD}0t&-biQm8AE$X2K|Lqx;iiZJ@LN4mk%12@rt;)<>sv;rO+tK z<8K33tSDZ`om+Pir3qfQnmsAC`|f{5?_N!|k2MCTc$w|)QL0P>v!94zx6k{VU8RPz z%{+!SADP4@xVBolgB1LM|2=OK%hmJW1yXF}3m<)6zpGJs_ftnMa*QbPZj+`@QYqtS zqb4ef9X-Ef*PZ1$S~}cn?q5K34K$9G)J6lh`ZRAI)!A{fileDgy4+H)aJBRJ9+`!Ar4?A=Sl4(k!erliN4-CA*Yzg)SrZh&Z^_qjgNDrQ&t zm@DV-YnHGRvENnVw_Q7r-EGgKwaL)UYF}=7k#KWYncwzENr?Y+Eo*)b;OT<$~t{M{Jir#SIfK3y2DrsOe`#41{{0&77_e%|ax z*LP$FBZJ&dQmqFIj1PSgad5NN2~d0PJ!Hr} zN3r7jiFsDL;y)trvqHyiL){&f4G0W%Q367Q(bD6BggzZ(x{n7nw7IoDb{19xPbhEa zf8e0Vdn;49SI2w9A_JK5*GS%a0mUv^D!-oa*k!%cLgT};BH ze%=Q;l2#HOpM29<`QwH{CW2OH@f|#1SciqE*5bF?9zpXK zo`>>RD@Cq5`Alg&nm{)<1gM|kSNg@Zdy2tBd=m9n`96>7$EkUtLl}vz*$A!$Yipj| zA&Hlgpx{2UI!O4SuKe8F^L`$fA>?rsP~nwj*JWmD38!oonYd%KP-_pC=;5ii{w{5* zO>oU{t^xYz&-jNh1i@HkeBm^` zN3FPoCkI(?2ZtE6^m))+P^KhO1mu9uu4&{J|k3O!piS*H+T3z@?#W!gjzO$UmGEE@Nn2!(eaB zqm?)Ar}g9IP)20Z@K;xwTLHO5W@vQey2u@!@8}?f{1#oG# zck{L(r1KxM-j?AsZevHZV{FuAQN(uhw$Ac9k!d5GF~OM%&TtO-Tu#;Sq18!I(Xtb< zcr}ge;Z+KDp_aG|^{8kZzJKgg6 zd%U;vV&W%sWK>#&0N@eDT7Bc;p(V9TOHL!_}{ z2ByPRXmzQVLOvp!oJU~b(UMTrZihEPYcev<1s3ju$2Hfy^-0Wv@g(_+3$L{7AUDrp9Y6fKnb>BWBxOZb<)sF;9f5)cVk9XiH5^2PM#&(rR_Bw?? 
zaS?$wNa|{@WMcP2tjDBZ-22O6V$IRk_K&Y6Z#1}km)xDVs;Obi3xp&#c5r=pSlNv& zJbW=zqL{z)<@3+!V%H3Vec6}dPQGO1>f5KRiN8QwrtYQxHIOX$Ef+N!_M+mf5pk*I zGY^4>w|?ca=vT9QOXpKZk27(X-}6|DCM~f3zn$fB-8oi$CaS7Q>@U}?Odjm0#d@m$ zBPk8$8QR~W4V*8PA&0-uzbw>6tRP8$3`8?*79X(*digL52l-eXs3{0`A;GfQzz`HNeqz+%B7ut^ji1%=w;8rtNNpyeYR0CRs9V11c5bqy@+lVkSz&gZJbS9mo z`9ODZy-doAf%xUx{BEMFw6F(medJlhRtE}!Gk2i~&O5|L86SYbAI}tc-(G#mZXkuz zy>c_Md*$o*RwAXd*eAIRe_ym;_EXSB8>sbW@L~`&7mrX>VpFt>G*yL~-W5FI7Qrc& zi9xQ?BKi~<_869J?Y=eE*%(0Ev=?oERv|c!+|xPAfuJUXRB9g4|5~HYo)gG}TW$q* zc5D%DzQryrS@!XrD=0ceD6X<-5Nags^3wGQ~wl+96*-Kt{DJn82h>g!cxQ#e6hl#r0 zfH<4{Ql9DeGrFJsbq9aG6qhFh?52FYj9vYSR~}?pw@^iyjk(#Vx0ot@ZFLTF>lx%? zs>DFaz+*YvevOe-mQ?q}_C77dgCUT2?6-c<57ET|4Y5N<`i-ph?L#5}b)2mPayKEa z3b;JC0b&)C`-dt2Doy4m_S>Ilq}CRW3}QYcLeAz#nS*mk)7q7qPSu1*)l{=Ibe(k< z1^;{{O#>fHbc|Y~0%&HFAZXva8l>AFG z+2_{_*dkM$dz)~raJo48tWucY5dLq4OW=HZV4BU_&!u-l>*dT;FScGPI@Ct0cjp`e z@eswL1crkw(a5ovDKCosxjl}C?+t5`D^!!;U%4#*-`UzmVeMnri4~_}1Fwa?)ADzF z-NS;f#!)BdI+LY!drGNI5w$?pD^1m4QMWjceYwZ1#2+?B68>H~HpxlNcPd#hCHlu@ z*$MjJx@tm`>bjt_=2*WS?YZA&P_dDzB*a-Su)SYchBQU8WJ|Yyw|sLE-bn^ zp=yVm4MP#Ur{%je7eLv9MVbz`42+S;?hX*N7}{`L2X>z|Jn|Td?62j+URe-WP&fib zZ=cno4^baYb2gszOVgtsG5$I`W_l#@_&BT_#aWMR0=t9V7dSnRBboJiXAgG~YaDLW zaif@mvq#2@pxjG4`k~T=kwvkZ$bRdiX@?GlaxQ$Bhl=m1eeSV;O3SwQmrX7B& zY04e~SddPu`g2_t(Smz99#KlKUWJifqbYIgb+dXc18QZ;t6CRWP?|JgBPQjOM5sDL-%*`@U;5EE%gQ4BQi*y|5w%g=x&}*$5x-4QF+vI;gmQ zd#{nh+skL?dT1gjHHQ(NRr$j$Wd`^)9;C07qwvMnTkW@9R!UY7kEByRa?}syuh|bn zKhC126IG@~^gVmd*N9aWquW47fr&%NcJ`>HYJ9s}(?lwG+VHuKvFp4?uMp)#poewZZux^(?cgO)EP$+F|}#Gl#$K$#VYv%5rGwbyRF zy!XpB_@2&QnFdHcC*HAcKZQv@y4>Ht=$igI-z!ZpJL_k zMGvD+4jPppF^Ugo_9+IBB{kr+C4~IbM)4>_?9p)TauUDjmLEC;1mp>~t%5DO`=>*T z@8G#g?1{|RDVOR1uMX{sKHtpEyv)B(zDK%DG`p#vt z)GpIzDi3O)U!}7x3hPx)26Ma$U_^!sE&sVqAluDJ0=-FCJP$RD?S=d2>!ZtPDRRP0#A(c>$BwP{EkJ}JC=il+C;1|{rVeNiy?F0P)KL*Od83()S|;e{AxOZtEf z-%T%RBzo^_?SZO}Z?m6n4$2PVsFiPzh3r)@Pv42)vqz5H&oCtpF9rHjjWy4Q5}P8< zp}P9(S7|YLPQEs%)L(R~mY6^Xl3>M)b_PFo=5@HmG+eFU0L(MISoepTNP~ci)p;5y 
zkjV4!+%&_+_sK5}t(p%~(odarKge6-4bxShxX(;~2?O{f)wGt&^w3VG>tR7>fi*+x z7CD$yZ*>>NI{(-LFzIA=4>-Nl6_uQ%#6J9{)suQ2I^RlrNK0twr5Z}aDkamqN)g`F zdG4)mKa(6@zv7}XoAmyZ{)4&Apkc;XExJSr*FHUikY>CeHt!ppJ4e#y`1F6ksLKF# zDcte|M|f9GuD^xkHT)L)DhiFlKb5BN3jfB~_$Zju{`vL&HyqReoNOq#Nj~M&qtYdDc)oAyNN=a?ob4WIVJ^&(&KA{k zkG?Z4-R<=UJUbpos<7mKF#WBn=qsY|tcqbR(^Os^=jC$|@)R%kGr{$7d%jiosF?&y zwn)Gn_&fgr>|KHro=bmFK8hf}EZdonSVzdWz;YfJ$ zS>zFV4n+s+SDgkf0pU8qAaCGP&u;@KZ*x*%AE7qMAp%Mz^{0d22lh`Vu*YE7FDLb@ zZ}}^(?(_D>Sw1yzWa-zugg)jU0U!8R@cq9)pML}2{|gZJH}HLFb%B3DU~2vWDRe)< z;3KJj$R!jA)nk5w;LAO?{sqbXKiZC_{EvH#YD7#7_za~7Fi*#u~el@iX>sjpV+j5*?owAuet&}(bCcJPx4`Uk~Ye%0l;QYJhyDBgqF=86qixk&hjx8`{k61Q)mh( z9L=5jWaa%p91LL^FyzyinA!8uFi_WQo~mK_W?(^`^n5A*;E ztLfkzW_(nQdmw6n>gU9UomN_V#Izl3&)hLv|KSokfq8AjLCj|G5ao6DRpt>j80 zFiTzS?)+Qx&@hcB*H(2Ay{Y|zR_WjcYQ#0>i)mCNEsCxVzcnMsYt~HO+H+@I9intW zfehi3QT>JSfV&1%%F6pQ4QGQ&6%BYZ^4@T2qLO67Ti2M8A}#X3q_tELG+E@_g-eX? z>!mw8jYY*~IXHN+m#FT)TEu01rdur zn24B^xu5j;f`Y{gU!FJlHv(cNIlo;1#og8W2;%ZvU{Nphw+zdf=W`0n)neAXJMg^yA%&nJ~z$ z#*U{EM`pd*p@gCgVZaqFmk&?xu(Lro?eF$`q?$HeFaO+}9O`qE`r0Z+qGfV4I@}#h zZkkVvX@>mrHuVBYfM&jA6NZjgQ16E3x;w~Zg6w^0b8<_fZn*<-D7wGOmP+`*u3yx; z(VaeqnIVb^o@y)adVEAC19$rqYTT68-W6>&%TDHKoFBvF0STL5B8-@R`zxf{_8<1w zs+%H_bmBtPL2w^hrWY!L6THVU{Xo*OI!S;Q=&WZL%})UbS|Gtl7upmh&mFI0?t{>Q zmvZch(S)ptiwD$UO`2hH@n!DWY)KxIA%kwgcP}yMe-GzRTR=-{syPLPwN~h{+E$kU)sg;34Kr=V+hJvFoX8|Sk( zb3rKmkKn~+%#{Anv#dS7M}$}dnTkS!L{BtrYnJ6Ln(1TYk2mX}k0u`)@ju$+G6*l7 zHu^L=man7LUk_iFPkPaY*x`A$=%M=)kv*~D?%hI=bAJ&fNaKUwsOIIU69{TKc>ZuZ zg?g~-V0mgQBAKBdwaEBxn8NwmYD7<=GE{DvvbgFHV-l4mirg$RlY(FK^4q=}shDt? 
zXvMoRi$ftNRczKN?&pb<2k1~p^ao;A(;Ldo5l&jjLXoNE1{#lUm-su}DpK)}C!ATg zpGNQ<{0!ZLfcCE=63&3}j4No5toK&4*eja5CsC)=j`ogG#mMkb1ChPLO)uca(l9*2 z`u^sZA;Zri@F(6hZ_mi%K@R=b$EQ`Tk+X)J7+!o7qvmvqfP#-?RLG}%ufS8jax^v*5rdK8`2#Y<7+ zr^y(W{xe5i(*iKZmEIE>hB^OQ!nD}g{pb_^=LMtVlU%N0A_-TPyns(sQ28k_i^uAF zF0J+=E5~4Wbv9oBg%AB~?#dcXWdb41^0RNpa|3Ec6pj_`1GOzDB02s#f%y&OdWmyaNhUC zyEIbABNc_UL_^3@!14;UD_wh9+S;Ds5Ka8cgsYH&be0xPmS&RQw4VWf=C%l&8cOwf zdHS)e$1N;n;YywDQXL7Ab^5~Mv#=|b{Tw1I@Wk>^eBd_%-(F6E-ItpVd#yhHJJ>Vv z|86#5wn%tq!@At$)gx}8Df^vaIg(WOL6C2%7As-ej~u6|(7fGsJvFvN*_%FqAXrlb zX#e70E>GOH4Dd&#U{dJ=vv^f{@hw)jPpKocj_zF!h2De5WR$Up_uza@b(|YrE3AOV z)pd@JE*J{JD^cBPzmVTw}nV8j_WW^cob>^6L z|xqR+#whu}p(SAvS`d0k_}>DLzEd8$vKoko*%&Q0Xm z!*{NB84Yz`W2Po}!tBy5*sNUBh6wt&?Y4Zd*>s`hf^(hO&b~`k_$J_O-G>HmF3dp# za8L}ul-)$yVapgzSL7yY>A608qVy)BHu2&8kAK%%6r>SxB{qwB+{W^54&w?VQ&&Vx zsXrs9&SLB_>)f7;pdIlQmPSwvAnXq4sJV|D&Pzyu*#N;y@xvxA1xQJ)uvN>~M z5+!k>iYJs-O_s-OJMVUi&?Na1_#DZmUyGaB580s&1{M?(2Ai{^ccbFiT-S%p$S*<~ z8;b~*2;Z8ML`%o4bEh4NAYHkr)||K8Cd#5|EPC4|I4|hVpa#oOo|$*h$Zl_Xd3K3C zvZ&YMYiKLs`fK($kao>6p0v1F21Suf7Ismmv%~#znfr%%IBJo(BIUp;m!#M)lsbZ3 z63M(SRvv_P;B>8xdWp~Vz;DnXqKGJG0KcIz395Gvp*OE;x zVPdL(GRc>!2qP zupl3BUGdJ2C8X@W1xYnkdPbY;=AIp!AWUxkG2cYorJY$~yDU^$Ae&+Obi`U+>s~ao zn?qcJ)Kd5-R~+UJYQ!T><|FfGsI}41dv=L>hj^sadw5Q+37L;JyS&GEG5H#UCzNN8 z&)sPc_+mm31cG$!l*#7g$|;nI#SA03O{`<#&jZSX9^9rFwIP*}rmjn*<+o)-9nf|Z zIA{smqo~i#m}t^sGT8<7rVN~rUFI%1LBe-g)$v#?MNd}yPS2z^LX>;lZXvs_lJToB z^{~CYYGBr?Z7zf#mh&j1tXY$Xk7S20hiZRk8+3$lXm=`?6$jR1D=$k(N}fxfNwJ-=8iX&a@lX}# z0J|3y3)(T2&Idk-QrnK@IZiEAu=M(H61AgwEz_tuTg%pn0mKg>*ii}dGS{PWJ~Ie3&MrTC{M2Zc&x8e!Y;~8a;m42y2fbs_7Kx|Ipns;(59I zdnzt*-#F*#(bbLBTJ)|dIG=s9Ia;nSnIRYB#ka378dIH;F%*y0b5#K?*LXw&_a@AF z(<)WWEY|9+4RE~rycGpTx`zJptkJtH_&=8(7;Ih>ly-hN`>mwxn{707T7=KI^95?s z4L{FvxY>JXc(QUiP*%bpj~iW^GP;T;9xeGzZsez*GE>Nab5$Dr(#4+(zfC@B<5YLC zf-pS`=9OQG)p?}?>F7_y7(DH2dT`*^D$YZdDiCX$?EX~KN@7=gQ&20>UQ-KlRc4cmHFXVn77J4^&yGwxm$(E!R#(o=q6PC+Lqd+8P zd_@XRw}S{-vcp;QhBiB(kiap8u^Eczq#8<9CSiW=(Rc+XK}S9E 
zrNGAbDF#euPYTVEH%;{k^lQHO!WMY+yY3rE%O<%HG@j|JR!xlmG{NM;#A4x@5|Ft% z3gYFVSR^s7%X4r4qbH(k!JN8Wkj_lS-}qDaqfbTBN@C8nuouarBk+_bp5<4h6h^vu z4^ZV>U@@hwxnKYKSsdmD(Rwd zzk{=buVj9aU0wHV?>WPKw}oSEzyiC8F9v>b;Psu5l@RGOXo1$k#&uMsf}&=lD0jSg3RkH2;>ak3TqU?U_bLP8$nz{Vj=Ws%ne0Jk4@9XeYk}2ajY_@&nVsC7g zo%0H2hV?w`NA|i4iS>+o$m%(CNTjDLKoE7+zxR0ygP1oZlsDUVob#o~BpA|~JT6pj z9U%B>zu5prI5vzPNVqib8{}KNk_;kK8{3Nq0>tKq#STqLw@Mh-!-iY-f%g+(u&rCa z<5C71Bbd9t)h9?M)TnB%hDO*v(1wc0biTWdtUcm@Oo`)8&5b1loVDor!HI(YAO#4K zFwcu*jkIKHgW!ZVLp{UfW4$&ig6heRHJ*m#KrNmFK#gVD{imrk?9w$!f82kPpVU7lT z;p9+#`B~FG6xo1p7`duehg+tRcJ}SUy5-z_+luqzQ<{wK$JR_cY3)f+;VCN56eQ`5 zQz@(Q-0oB7XMlR#!iWOAv%7U&Uw^b2PAhIE^{}$wgwxu&WbmYaVMk~1H=E&I&^;Oi z#fH4K-(J?Nf@t4o&##gssdrE7^;mWmWIg~OZO}Wr?K)j!R9M5A{sg$jkjsc-9b+nM zpoXeRi1JvgEj$ z{Gwz1mMV9h3YnTK{5W;X2zp-_l`zFRjKPeJ>anG@-;X!tTIdBuAQZ2 zy1yI^rQ#%0CSM8Pup%Dg)rboE`mh5sk1w*`n*Ui+LkLgmzaISog>qF;D)$Gy4Qt)X zzKITfxD#kkdx_VsOA&^z4GMsakhb zV6$MK80#hBGJ5PGvmR`C_&S6+9!#I z$_K@}j7=6;DbG*T&owczqzYLbg3db9HKL4UJxx{Gnqqt|0g6C81>rBm^gQs)7%ey< zH_{}=)kqw;qUGLcSJJ@fo^9$!sjet`1n(B`r#c-jm8TeWT?T-AGQ3AlVlxwskrSS+ z5d|fK2`wiuiAY32D{+_w`{lKG1kaWbeG4f!8?8k7b7$-*7XoH>Z3Cq zL$5ZEp!;(e75l{g!~OO2`;MGt`*3xbiJkZq(aJ2Dvg?Bnbb{fAk5k{i*&MUSl&Wc&nsMuM1>zciQ z^R|muSWmur+eDIYn4C|%7%pmY*Nc~CD6juspzLD2mi<=AAG>&Ew#$o;LXhB7pNV`& zh^(#+S4ZUWCy4KY#pG>`Jxr>-$b5VG`U%$@QFR(gLcqEGx8wSEB=%3P?BB8AKS}Yw zgXZ@pU%b&^wZ{H&@{L1ZeA4j_Iv0yAmd=7tRQ??suXGxlKN+F>0seIE!y~VAnP>bT zu&*T#KmGd0AKJ6Zl7{-{ZM4--S)GSVw($Y`6@UGI_gA z@-HW45`*{fVx-i1sp9Olu_g_SXjW(Mn>=#!3w@9W%lrr%0H%^%=0@puM#uFb=0eat zW8_wtyfUyf=-IS+j_3$HBk=bR_lB-j>AB~bw6saOQkSS-rt#_tQfg2;-=Mqy2LO<4 z8CrYx{j0=TiB(N~xd_xd)NNYe6KnAk5py(c{fCwD%p`y5RCxR+qo}mAm}Y z`Sk$!QGvSfS{UG~h``trxr#Q8V|AkxmCeIKw#QmXg>?#ICGrFFPp}&kj8x>EHO) zMSaUf1GemsL%we9vOC#b_ZiJ&e-!^rmSGIe+b0r~>U8|hbFM80r=-?o2G4Dqbk^#i z+06?#)x~1yC`O07p*$vtY&M{+K-gqf@-lz^Z>DS#lYy3eNs$n~6l3lg#)tynkg#2S zo8#vYC*Rv4P`^nn216`v%&CX0=sOg^>`=7tHN@I$Gsn=if1HFb4pFK)jaDEAF`@>( zt{1J(W+u*^`f$6}YXZ-*0-^8Or$XuHuwZaH0OgsYx;P&8TniIjBK?}yJ|3x!|GIJ_ 
zhhFi3et}ZO1;Fdd7cl$8u_@=CZsE|uN^Nz!J!nw+7 z-O}l!UP{B{1MlaTnd^T?P=nrZP-7AfZ1CwPU{;PQEuV8KWU<}tY^hp=z3zz; zv1o%)GIZZ>M(ARx+m<(N^kr(dIDaQduC6JIzx(IzqENKX+ehiu5lDA4^ln_XGp|b^ zG4DRFMS9}ni=jS??YVd0^q$6YK6vs2%o^jf_GSlJDF$7WVD0v?6u_CtQVuY|j_=Bs z7hEPRWkeGe3R3lDfc0e#Ed*mzcQ;M9^ebCw(aLBmC+sNBoAp22JeyBTu z1Hl^L*Fdst9=$L^`Sb|F=%G3!%}Bi+p|1jvGD8@|dNzGqa~W_T%4py)liqQs9cDF7 zGE#cbu{*?CNX#KHbkKwIy0UwZr#ly1|N3Nc3-j#wl=HyrbU4ynFJ{(t%@MUeESSF} zdHXk&((2iC5vY1*7pA`je0}#?WmC)_pihQE8K<2-h;vCp*C8Yjt#Nbt^c?8Y&@*`u zC*ioI7p5*|U4yJ@vN#WEp&DdR60Rt3&_}O>g@uN33yhYa<~i@pWmt6NdnCm0rI7`z zk5j3C;BEKKBR+%TK7CuCNpHJtdTDZT=)ML@L|>z~TzS$+VEStYOArHu z#`Z9q40@md%qo_vfo5H%7Bh-4WCNH%TaMcyAez2Mmg5vvoBlRe%*qtdgr#FL6M#kG zv#ejL^BhR?xhRapSJ^24e5#mu3@{2MF{J9hf)Zl?2$}-7C^_%|ZSkp|hFcVVH;s3Z zLNzjyVv%CoPKku4vU#fzIh6NC@TIM`W2Rs=-gC#qh;n&p2Yu)jN7<<$4Q)7xr{%WQ zgM7`_@!wJcpyELNyZfIAGpqNj{0(kw0A+=TkT$$}e)|=7CLji&kKW1h`jizW=NbuZ zTtkx-8T3xKZ(=xfH)Q+3DzU8_d`Z=PwJ8ssu%&D6V=-6;dj(YG5QC z=?@;OK=LQv5>PSLdDtBIPCyl8NfM1l(LB zfs>282w@FP7e87cL%ZWDMds*rb`aJag16@Kvi^K`V1V)j?hiYu;H(wndo^Ye5pYS0 zb%JE;`uQJVMp3qsy^QXOua`@TM|GLHf?S@z4zTO${D@KUHRS#e4%EC?EBdo^k~7%A z^V4Xva4(kAQ#(XpZ)N~J#|)jt4RTljkO0XilQ#{}n+tI!-Raku7UpnokFrgFloY?w z)n}^fH!v(}-lE`37h+cHoQ@ftIKY+D%bM&7CU4neN8C>c2ntY52Uw|a`)w`h<2gj% z66E;1AqZg4KvtVbbISXG+xW#rCFZo?O?0IqhkXlT&i9XLPufVSi~khOFBC)ZWR|h$ zC!yAwr*G`e@IN@ZER$IM5n89#lsQzNRLPPc%sK0uR5xJDzvxMUjDs=#HtV`UZF85D z-xm7se45JsP~`aghYw($$k1JvOm7UBZ=CcysZ>UByugpQ5}Bt+--EJo}8 zP<+0>Fn@nimOlwwZ;z@2InK(`)Z=#1M_$ejg{iI(0}(B8&i| z{AF<)gGdmFB@Vj50n#ceVEkWULcMF}3qhvm^1O<)tP;`7Nk$~763)~k+KY?X{3W36 z_{CF;ORGOu#b>c(KVM0_dGYbwaToY5^Ct+Ry<+s_59a9KL2*zTWr;BRe!jWz0ZXUlc*>`tC-G=SB&TveNM{j4v1*vB2k7)o3k8LvQqZr2>faGM>7$L}#&Vc( z3;(F?=XqPXV-*ICXFZv{KrsEqL8SDP*(t~2eU(5h)w%xWBmQB%VHYeZjBK$78oxPn zMN)|kbWFVcYF#_9wm*m1%0zx*WENN2reL0yX}pZ3&b4SSFVWo3E-wwr8%Nv`z3Ekb zszM5g#AM{G`$#MhlDI)WupFpwpeu@V_UA|mS)>dhY>&qUyS0#qyWO_B@K3!WN zpX_@|?e`B`#3&9p0BsFnDKLLD(iP@|zeo}-8p^FO5P%`eIXgZsaRuHZ@XPv28z z_~mYYf)*NpYv+k%0%v 
zfnTNPrZD`K+F`~aacfzn+H~GlR%`hifPeq>r8QZyFEx}p@c);6rT>!x__ob*31dD? zc}V;ZhiF(jK`FKJqQlNV z-GZDRN>=Kx!?8W*F>%Pgm(&`|OOhsed^YeHH36WM!W_)Sq)>Kv?TmS@G(MR^SeSK4bhyBXHDv{~Z6=VIUTuCKCpaW$Ps=*G!u4436 z=40#JX(<+{cgFSQ)SexbujL3_k@S~fTn~Vv>M8JTg>Nw1K5}!tpQitxdZkCqUVHrR z!q3Q)-~zG|W7qs~r^9--q)Ltz>dhkcEGE|i~pnSmYCze(=>rk!N;{Pi-;>^z?)^9zUw%y08yS={s2To zoE9#?GgAV|N94gJ!gkv(kM}zB=@}8;-^x+)L#9pN<{YORD%0oSEdMR6As|D+ZA|N7 zuXyb&KSs1rZWQz5FfN~!X`S|WO9o!S72M?Wcc*mua;_7Mzq`^umi4!Y^8dw4RtcI6 zo@s=>!qNE*E_637DO8BdZg9`loTXf5F;lPQ^39X|){sRT_1RLU;(5t7@2(EXM{GG9 z`^i(=XFNJG1wHSjQeUX#gcal>n@Nfc33CvMkn*ZT{+TSH?D<+ltTJ7mv@CwvZ;e^h za@wV?4yoQX_M-!`#R7IU+Y8|c4GkI9=VK%{{_ZfTo2~WWQ{^M0jgxm*)N>FLnRPJm$5Hhzs4Jf+;AK9S9ePS3bk{Ed^$MYR^Xj%%z{LrR<8Q-c?`a;j`2aqyro-?J7500=ylG$zbm2!FNjNXW4M=)ooVvK3w9F?LdD4;|UgTsI$R>zYCe60u z>%DwB+~H#2Gf3k(UTiCgJ0zOc#-GNh1`Yp9r_BJ}zOgv@`nMTICZXaHC)uJ6mbI*i z(EDZ|M&LJa2^u*-Z(WadC!lT~0)?IAxyI2=K5ug=NHyUZ z&h>Lcl0GiDfRhac<{NBw}Z;gB2wL`z*#-g9hxEwiQ`M-Q${hb-HXKu z^3op?Ep~IJB04gW!H=@fmU9(uugyxAr&N|_Y0);)vu!rjr$7$mns67%f#nicbP%=V3r7)Fxv?D}ZDThL4hc~eUhyp?@vG~+X}mSY z1`zcjsBSq(%x(r8Q{H4S6vUXk6ah|5c|jrc0yeW0!AG{=A$QUlkG54XVk8cFE7K7F z@^NF9=gwTj!ojQ9R%8j|c}9b%jY@Nw%y~ME3ToSN27?n|R(t!nKNjbhq1PqJfkRVm zyCj)92ofqA#IjeY+V;K*1=PtVD^M!;u2^#9EyNiI-|1j&7r^&K9W0@0#bZu410$^1 zQBdoKHb#>+W!P$jN136ED8#qiCY)w+nsFun1_PTd$`&uum|VHvHwl~OZI_E_gIwsK zbISD0B!N2+0R93p#r?l@C4&PZwYB%IJ%Gr0o~|uHzkCQww^a-#R4j$X@FmLylQ4Dm zJ=-9s&lva7C;38LcHUaf#lJ}82BNbR=YGbg@1He>&&jy}rgN^#vY3~Rjq;W9p6v$O z@1|#MWF!dOq>pT5#P+WNUzaYQxN~GI>0Ysc^LXVw1~uAQEk+lkt>Pf@ZRZ@f5q_ z%4UToX9Y1Xa|o3kY^af!FAZ`Z1*w+{M#nW+9EwO`M?v!BC=e5AhEI2#0EK5E8-NIT zh^Rv7$&_OZ3j>%suCTQ1;UaCo3dvBQjLg($gl zC}VeW)7p+^?56agf)YlQ?H;JZ#?OxOG}cZvfejcd8?X}EiOunW-v%cUjXnKuQ7uj4 zdu!i^gS>7R=KRSH!Q~AK;0WK7pD`u~nV@WXteet%Hgmw`bzF1TE zL>MH+0m#{eOcTe7!c~l5IouH5AcZ+BU4jlFWuWL|`UpaD&D;qC)ojz9q~NT!MtE-J zdp*SP)+bu+@*e7;KjM6q!c(j|3l@3(WOxerX2K)~g|h2Q?x{7)_L|o3*&%4B(f5h6 zTKEWPA3zyjG4?t330s}UmON=3dcde6k1+tkAnu7Jri&`h3y7d*dqlXOud 
z3l}sy7bs%m9srAjQ?7Yz$=>vRN)wgIgx*bj)ss~;TkpQdW<?JHvKt%Y#GzJ}@)2d~9T z*w;egsBjK|q%3*1LvS{m2i4`WeMG;3nbB{jB7Oe7`AKok?Bg*4kr#DO>z^U*_|o?8 zl_v$I@R4!48lm};aB4CF{JIXek;bR7wuOW5BRpLQ4)UAcZ0N3^fV?%Btv2pQOH)mBzD~?O~)sTS}|*lUSmdVs164UO{a| zW5#YeZpGfBTDPipLPX1imNtV_(y2)pTdQqqsiC&CYPf73vHod5Pb&p9Uqo&2=2sjMj_J`{ZX^P;NIZtkXJ2&n*e^*B$jS{^K$ zwsw<=hkK93ZUoLi-Km^TJU`$f$Wi-i11US~MaaJPN(PAj18eT>;MsqXcW)A|Bhz6A z(OAgKnY!mTL}$B9JM6Q{VI%lg4wnkwtZzImdG4}$2bgO2={!pWq?u0_iTdVGtQf?P zg9!e<`H3pg+5tpQOeH%ZUHjF5YU8`to5AX{ z=S`i%^_7^HSW%tvj5O8BpRY$dqf64xMrRw~au)gHOF=`J=U+ejw0niZxycf(*&g9v0_ zqpe*Y8MFrywDS5(SO#y(5Za!D46Jt|D`#FTb`SG5S0d?t?mR zTZcm3;SHuZ!=GNzyMC6GzGZ!omKR>1kTok!UnMWJtYW}KwKLXH0(b(_wmvgh)vN!o zwz`}sXzX5HcaL!7FpUy)l&nkgUP26Q#bi#x#lfdc?~0XA?FVRk{dAq7zE!ysz1k&F zT{$b8ZV#fLGTw}L<{Mveof>Okz7{938+%IG--4hNX=e@bF~s-Qguibfu8 z1YU`gYv26YaI^#jT7FUGxWr(#Vhgd6G>_G$$3cMCWSVSzt>;A42M>i8EGlFzAczDYT?~#*ZjrPcV%2|%W@%?Pw8#E4025C zOexwQR4?=+j)ivv!9~focbW({iiWg}`mXscl{W;7tT?$vZgfXG=wv)A$fh%^{f7yA z)NcqLi|Xa?W}t`!Y1;(dWE7qavSvc0>nx0cF!Bwbw5 z8|jfSxEbu;>ceu*w;Jd07U1t-U6~E}9JaXZIRI8-Ce}Fp!iE&aA1sUPjBKk)%V8TV z+-cVW+()fr!;{@x+2Hgx$wDDVxeU*ptbb7=1zxrrvh02t@KS@3+FA-X1JCcqv>)Eo z^?xtbv5#-T$-S>HUWz>=FL*E$$Zn2fv>YJ60dM_vm`NGQxtUE#FaEccc-G{Q?E@=# z*H{3vl5&kI(2I{5R!DEJTzq{XNZBPMJ-IRc9R6%NIFrqgm9*T)JN|B}+O3*E1mp9L zflXTD%aaUKi%iY9c})$zxnxaC6+0C@mAxuOvb1nrB2SSa8w8*K!m$SD)Djw9zM;Vm zHrvEkGqP-pe=K7O*>(Dd@`2bypu8$S4w0qX;l8$ECm+Y1s*|e9?$Fjg<;M=+ zlMz;b;u|sEPMgU!61rg%+#fw~i1!s4i&jn#Jz%2*Q)wA?{-O_EHDT%v^A*gkO0HR#fV#MUHCRhWo_;0t6#QC354$&{xcyB6eD{u_AWu0w9H6?oh8P z_a>Ll!kW!VWSVob{Uf`YZi`|XKpPIM2;nI;+oFxyCxVKub}n@eAO<`~+4l?Dx7&29 zOklpJ0x%y)lqhC_U0T{#`mEbi?er1Xo;zz_auw`=|wL3^Oh ztAt6`62DW%K!FmFG=r=SKf2bQ+Oi?=OJRROTv3gvL=INcpnZY`6{ZeO4yv}k1>9Gn z4*wD?uL42WvVTbw3!8R849+g~>WO_R!o=B!4Qc4#oSB<|j@$guY`y9(O+byn>`{~_EWbbV{3xZ`hK zVj2f4_(rB8yoNvq(QE9!deL)+;*a{n-gCkeg4L9`;K5^4CNFqQZ-skeE5}GwOmes* zSH}0Bj4~SmRKXwmosK@t*Ur`j^PVUv$Wk=w-y~ZHK9dmLZG_8F03tix(6<-VAL9Kb z_u4F2#W}PoZ&dEWR_x+ZTHHQMmeNGMZt@IQYPFY*jd|@F_6Og)vz{BFm=7~sljm=% 
zQY}0m*>$f?Bg8uJFm?0Qa+wU?rL?%=l$g?vMXVK8M?cRp+M_=;3e>N)x91G>212|A z-!M>PZGQ*u{z=~p7G9>U3FIxiZhr!gS!X%G7(VORfaXG$CfW!k>e0nNhy|`ZX>)Pn zw#0=FMrF^5Pb)mGU)psWY_vh(&S<`kg!W2>Krb3~K#ts@(mpasZc~GI+osYc4qF!8 z%>4I3MhQo=*$w}E{LUH^THt*q_!C*@j#Ies!-8e&^*$jNb}aE%LBtN&_!(R9aR+>M z7-(Lf9+p6=;C~1ti%TE(Bs(%eFjaYq4V4k|o=+rY*~?K%1063bfD? z3Z#@-%D$vDrOXxxp7)&J`JJQX z)iS%ymV0bDGc&VIo}=nBGn?=;GqVnxI|tl3@)zq{;A>V;XUUlxAKUBgnHjtg(sF1| z3~Qjz9FWR=_3HoxGMykiAVnR3AoM_QhrTleKf!(5=~#xPcfNXtKnQvmlsXI{8Zdr9 zDvQG414$oFps@N?`BqmS{<9(Ua0oQ8pwwzx!_W!L15zZo&JRO<0KUO(@L3VShYbE< z=tLMj5kDCG%)0JCmvpUYg$E!En!*oHVc_0>F))OHTPOsfhhxcu|D4}{78Y7Y%eVeZZ^_j4u+{x)+9zse zL#^bIoYL@Ad75UN^60DSj&(nA6soOkmRllmM8e?HkcA} zK#B%?(6L%wzh#5R)<2hd(j|(yy4o{!IfG-`AndWNuNoM(z?{D7?5p~?8fqOZk1-{% zoJ>y|mvy}xj0CSL51ov!mrHu4r9!Aa$}y~x_h~aV_T5H0*R2%`R&Jt#H6~$RG4l<= zn1&`kA$`HtiBYNERYJnd;SM~K(qXIbr3yhP$dWG@X=cER1IVq@YCW_jQjvwz9TYVU zwlk$%n6FsFoLQvhN}&Yxm4;qVLqgkOOSCra@U+bag~YKSpnYT-J3(~{K~oQ}3{jIY?4Bt$=_thXE>E@U(a3KHywrn9sn{5E zIlGmMAZ)^Vy5GqbG0Otipf%5H<9^Y_a=t>Q(|$2sZiMYfDpu@j0CgiYTW-`kBm+-5 z9n&nbYk)d#dT5G00U>3MttoxDBIO}3;(E1gI8J9l3C8!sy5d!1s|kuD7%4}1%Z&JR zZW5#-Bg*u_ZB!nSB0lklY+CQ3{koRr6t3VDj1+J7$=Xz7jHVK29D+%A+fIRPHF2OJ zJ#9#5eW*4R&;WE~QLV5rihZ3fW$G1e0(2q+{)%kZhsmkZ>T;l`bQ#>@_yMUXkx>L% ziy4;k?l{k9<1vR#$C=h7BvW!gcm14GGt&8XFWu-sp#%H%akDJQ`5xK|GwFUw7+Pd8 z$`g^@j7Nvm;f}$Qq(37(lZ#DL}1mYhpyM2eWW8rbZp& z8P4-vA#9rBpd{L2zNN!87b#|IR2)g55RRIz(kV4P%1ndHMv#2AaK;5w2EQi#5mV@( zbbyFR9HdjpNwTeCL$g94rA$&JmnOMB8@X%|+|X*d5j;p2hl5tm%t<}E+?{l4g^8-W zjB4gBn5MxgVKCfGLzu3Q@}857J|2}NYm~PnyymL#0P3o~$JZ2C>ll8}m^9H0OsD&3 zs%DKux8hQU5NUzOwSoq!^w3PIV)fc+jg|%9VY6WFj02^du#AXI4XKP#9(C+qGact3 zR+9ZLR_Tp1iaz2Uk%7y_7;{s2Gu7*bJrx!QX<}4YTtce$n?B`cpc)56I$$+my-2x7 zC5OC$gD%G42Q*CQj5a%hd%0iK1eIB3SMhaMp) zQXlICe$m1Pqe8rNo5HI3777c`)O<;!UTDbzj5{bsh)`lUl zfH}QoTfRDOSG-;_%S7@d7AdrJe@J+2lj(pPpg7sA7=!_AKe+st`TrUg_)kj@j(>t3 z{sj)$>|yySE|yPZeFut_%y8n-|H1;3uVVo@C&^`z<8h=Tdj>E{c6$K#>rO9kwVMSe zP&3KkyETTkMo_JnQpRQtDYX;T4Te@PBb}k9HM0u9T}%{Hs2;WK4mpZ*sfHH!DASto 
zd}}}%ktB%{5C>W9+bqwkc?dCG4$h9s)shzuL7j40sO6D%z2!O(+cqaozfdbH`B;Yt z$c|~;=+tbxPgP13OHJV+5%!75YC)Zm2F5ZSLIuB)?uESG)3KmSP82X8rH^-G1)~OL zv!8Co-E6O3EcVPsGNsNaM6w{b99VHUgG|D#K3O!bx3c!A2yumgyIkJnX ze5Mn_oZKKN#7^crN=rm_Wx&`~Su_>b9JbvtK?G5^ z%`0?0RjsH+x6vw9^KCj(t8mT3>QPz^1kjfV!)zeSq>SY{{m7i8$dJy_erz>5rZ{Zc z*_u@LCa($d+VeF=|M*Sc&Wl zLZv}j6|Z7f8pE;KqItc-b!jBU+i;~ycI9z9LW0bsQ7i?aZAW~tjXQ!#!bgZUWDlN8Tq9IhoPCn)1aS17P>ecKtsOPOx z%+fG#Wtj}0S5?+VbygsUA>=lI%1dd$tjS(JPA1#x4jkL!3MLJQP@^@R6y2It^bETz zKn<%fWb^q5rc66(XGWGnst#7mS3y0W>sMvH%k_fdC>xUnmFo75WP5^C$a1v-q}Nz1 zWmx^1Cf1sa<>b^7-)acyijAi!B%7P!qax=98fkQ6RnB=(Xy>D;Rp}KCb69DySyAtq z5lTjaLTK%&83&W7)~ych95B`eftI_30JbUH?Ft%LGO)KoAv=X4r85b6E+uE%0oNfr znNb~{YT_UW^$8vZm8n=09k|2SIz7COcLyODa)ts|pn;9o8G@=!dgV-SWTkbaS8wYD zHdNw9N0+l{TTmNVXW|3Ui_`)n=j6>+wkm_?P?#w@Udf2C9=*}>`iwixC_sBNWFJVX zMh%R>FZ6g>ubWxWsj6fFfJk&CB~oD@Y-$Cb5_GSTqXaTnox4|o#bP^a-49$=%`){4ovT9rO92RKkwq?@TQrFx^EC7OkL zvs#Rd7N@i#G*t%Lo$ThO4Ngt0ltZOkZ5se0q6qWFmfw`nKVHDQVrI9p}c&9yP9EK3PDSVPBEm4$POp1yStd*rlrk8GZ zHFU&Ijj*g|EEaT(SZSzMCDr=GkgI%0iw6ko7r~H7m>6=?k!F+<3f23Y- zwnA{+MJeqdn9-?gmffEuIve}}XiBGbpwc+TF2NxorfR8r$V2dS+)BY%Nzb!VK7+*Nb_O_; z9kCpCMVJAQVH70@G%Pl27~f=TUAfY=@&mGo8DvHkaj<%fW=vz)2t_7YN^q@Ec^P<+ zZD21MX4tB`n}hmyI%ui1z1IOvU05iC^8QzTV}!03To_j1-6w% zs=%KaS5%q;z9Jo3%?_TcCt~6vIaUk$t+K0E>mhD2!-j)mDCdp!b|s!xyk=rinoYFV zsGHuTR2XDaR2C>bRL!;;N+pZo}{A%3{lEt4dtUDv9r8iHRa28*RBf&H6)ONRz{W zmXigd(+x}+w0lq;3x$G!jnJkyf{=P<1QfdKdkD;=d>yGwCZRFSxBUcvBcTlYBO-K0 zsire>wb*dSJt6192r;CKlE|Bw8w^3~q$`HQfN@+RDEaoNz<7f;Jd&F=7uW-))eYos zgwY((G-S8Rmgxc@pUGjlKWdm5T}&B(>oesn!y1DrRGd_*mRD>dh@04OBi$?%nv?*X zRI3Af6;`JvUO87_hb^(+B5Vgr*Zml(Hrx4;t&DlP#R=@R3u>aRI;x6rYa}*F0xpqi zC<5Iy0c)q(X|GBri<3iBY%JB~!blw((;U{1606F9d87x)885VkEesYXii3D1S*CD~ z(ufumudnk3eUxk*2eq_8Z8V}}Q4q4jg^42<+EWYet6=gZYNsdJBGRV&?GX}# zXA0y~nYx1)d!@=apP3E=Wa_0+-Lznj<^3jsYu!|+^fKk7r2y&M1(#(@X+2R3QC|V7 zlBAg;A_L~S?K^zED2sAo5<~+f@20eXt5)1HSR=CUl^ZzZkKNG-hw#dv*i20E$e1EH zpJ_`4qS``>u?GS`h^Ey%N$V 
z0W?#Wy&`vboeQRt+mS$pAj(ZN3cEK#`TuPvI=b*%@#kg3CX0BL4!ZV=6vv#LHk2KW}=PI=YNZS!2Sbe|9{z; z7JTAp+SDgT4NOEUnq)QsYe}RBS0`E<4buOL)aY0ub?Iz+q;fKvPum>`@f)LtYY`oa z=gXv38mG&>f}2Qu0Z58q(NNAN{b51Q)l35f=$M{^Rj9s$DSWS;8iFX%n63sM)#%si zKJe#zJxc%vuGa%ja!d~fO@{{1QtD{!RwbiI$u2}Ap%KVpC}pT>IUCj5fW93xq*lyH zCM(qgbEKOMO;agqsE#nKRFP^WI#Z6M%81fx5+4^`9ti;iB+ao(^g9z(11aqIH-qk zXOQaW@sJ~mS#3}r!3-y7G^*!s9Qv5x2{@B22OQPQ8&!T12?k$wfFWcQlg$mFhMHyZ zNjT{TT~DGYArHFCWJ}d*8^oa89LEc2+-{G_DW|k!yOz#`Q^t)GNXj|Au!|TD*-&lR zA;vtL;`4r6)kqci?QK7b8>tkVtxg(B)fXYY@3jS?YUC+Fn+&F|FzN#y ziFK@IX-pdJ0*`qUD_!eFOuZKnDLqXHHEE)`eB=godqNYLR8`NAX{DlwHD*)_N}&ss zcP3>0T&DmC?}|ZbluW1kY^6v$8Ow*TqM%GlN>mu;TtVS zv*XO65qHW;sa&ekVC_xEO5T z#sdqAu$}Ck!tqlPNi)w#0LCLyK-z`UX1x&XZY5$jfLAsN3grY7wUP_@%> z%IergVGIp4Z|GNroaC7R{4#35;TF#TiGn5y-hHBl8RV@1q_%$U5w zp)Fmq(5TtQ~^^ihl|VHIqY)A33phs(!eC2bTM$qe}%lTA-^fc9rntwGfo zas854GXuWixFg2OHnU`? z*#bV5%2j48#zJMxSP8~9r5a+kx?@>r4Qh}+=u}0l-a}A3OhaXa^M!E)Yr_^1B6Pjp z62l+>&c#qS<2V=xEfBBmu@-|ngupbzPJ*k!)W=%Wrl#ev$D~tYdMwH$TET|}uO7yc zhc=DQh)B#46Zb*+x*h|xGsJ;0huIE{hM6MVEA<;)n`RIjj(agBVIZHvQFTY^^rtNt zp_>UNQKlGb;iMS!fSncLwwq1MeSMf|}4|NYEsZMngc>(mtky0Tr;k9z&U!7z_%H1jqLVBd*XYgrkb=yJNPa z)eHetXPhotl%0-2G%PPiYm6XxKvg=T&Q91#GaHxsLL7o_oT#jeBKQ&DIRbnM2?5y* zT#?n?_E<%lv8`f*UbQ)v+8`1st2k4PlBJIMN>Oi`lwRhwY>jiqjj{ld!4yyn9y$a* zrKwbk7~LO^8(6?%dfaX_Ixf~4mYRTQ1r3OuNnQj7O)QIL-Oqe1NYQ_%-JYgN!E)lgArtvVdHC?W?iu#lOwQ^gTH?pHQ~_GQH9*Drwqjsp#w~iNG;Z3rUt-= zj9EhyV_GBr_iPT>!T*Y9ye@}k%B6f+sdv*VO)Cvc_$QtbL9_|r8M>N=tG1EKH_~0Y z(BVMNkmj6sO}ZIxQ2_l<1oTGJ$YV)Jsk+o9(e|Wh1HyQ%K}Vf!L(5 zFo7qQK4nTi5_^^2q!I@`PMx+bjez)SER>pkIgbo&r9P$W*tkqleN)f%M=6{rOr>n%09cmT4!}P9K(}hLMDwK&)Obb<3CwwI)pBFSA4%i}b+AR0#mR$r?Rrm~c=X z`~7sH9{!1I=t(Z8DQ?iEv_1lX^=msOU{t^!lMcYe8M;)-bbE}c<7%Z2X}zGLHu6dG z5y=WowA!X(9z;>9aTWlRa(}2xWm{rI6bLZs7olvbkx~mivz#2tJYaOn5i*u*Bc_wn zIS9$7Mtv&-7a6KBs^+{_p;8xWe7l~m%f)IAQR!anyM=t7=~Ipyx5uE7NGvqHf~F4Y zO~jefCOz)-s$)yYb2+g+&LTDB8XgtORv=X?ZJ`({ZhByl_9O~f5UivD>7NI63OYMR8j(7x0;dA-DA|OP 
z8zP{8vSiwpLuVooUNxyst`Jw!Tt5v@{jA|m(Ln^_f=Hp@;&C;0^Zho|&Gl4{R>Og; zm9pt#9#Ep$08LwRb1EnIYo!`t!3CRPie7^*$y1{|E)%Eg%;)h%rLRpX9+rTH&x>(kQ82 zi-N%KL%mpZ1u~kHYAw#?)E-4CTqgrU<*8EDQGqM1LSE1f+(tyWHLO^3blem>C^j;5 zIP#UCNw7tVw2@Iu9-u=t7k1qmF?A`Smnmj*`5{&zElM5dD!_eg7j%qsIMtb!`en$R z6qKN&!BUoqbFo+g{y?)n78|i(hn~k*!C?h4dN7|+HIo8yhRE-Ai&DA|b`fw5L6zh( zJ*LIG)p5_{0O>B&OL4snhhhiQo$(Z;*0B|GNEFKHMh=ATblplP*=PMx8Z(eOU&gvX z!GVYw)k>8nffoT@1eVpPSVHuH(VI{?qJY3&lPGlKSQzJ;Wxq+9D1l|gz#FwQ&8C?u zAc{AkGHxLYm>a%9O;}9qc0o!~)Gdk>uO&^n zEUI8(R&V1YsuQSX;C^Pi9+L&b419J}DM^#6rQ3k6Hp_;bEC|->I1O_`5FkG=Xbk|f zzzit{D57GGhxJhbkUR)Dz>FE!E!zb@h(R8!GzbVr(CPpYi*kuWjjA=()e+_D!^#js zc^U+SEE}ABikpP_4#+iYr-|~2&+5%;hz;pNQB#in_^ z?q#caMPx8juP}0BSmixEsP@D>54Q!ks7zBh=DKFYvdS<@6Lp>HL;^Ro+*}KN-&ZYE=cu!X#{H` zD~j2)*r~LKw%cwu%m#~fYFGgrog#qcos#J`BP`?qlhYtTtD4^ADBT=-4Ss0JBiskF zPI4W>AWIJ5mqIMX7l?in>Eu!x+>bznj&6r}bi`8=VgyoQy&6?OtWFGSsSy!EHFF63 zgw~XpD*ZkxlzK3MX7Uvbcg3j};tb$g$a+H`^jStKH<6xPqo>V|qhelXN|%{JrsfgW zP^8E@l!|q4*s1rknxhJ$n1;*s4A4C-*RR(7ao3^vVx~kFlys_{>Wnh&GNdTD^fZu~rK3Lm&f95d49!k|`1hSXkq5kl$46RKj-2sde*AVpzwK?U7luN&*?@MPuxO1gdVIt}%Subig3vLZG;b zmDCVG=+*47SVDxycd>#Pwh^otW=L8qgoG2Yg3yZy141c^jP7}X7X&h#z|U@%ATm5E#@NV{AvjeZgT$U8UajSG zQJ{;t)&ztRlOu#ESVC8NS<_(!n~$;ylCeW2-w-E3ze<{2E=rSj%Bh(dty<}seGf2< zIEZk`LdQ`$Ro;k*Y&q0?g5)S$lU%ZG_iUtr;0=&^qLZmmfMO~&MCi(7s47@j>hll#A~)2_d7MHH}Iy&h|U+(rqHU}RN}99ntfF!l^m%t-L@!WHnY(a zD$2GmvszOdmtD3m@qX54#?wJPs5h_)!bv1jHHJlBMHHVdH%8OesM!sJ2BXn*vq*LN z<%T_WWxgqqDw7M9v~Bd*396MuX+qMLDQfABsaYkZRClL|(Ll37;Rj7;E{;)z#fP|B=2CDbTG}JWv zJ&->TSL-133hvd%amltybd?Var9qyy0AB4%rGYL;l+@GQZUN3Gnu!A*Yokt<3UoP% zk);d6g5JlGwvdA{6EZ*&EIqQ^MqlZHC}fJVrF4RZizpf@)>z1tAu-=68QGvNGNpo6 z1uZHd6UIh6O<4jVfLzxbx@o;+<}4=BR!p%|l5)VuXWG4*PoZpioaAwkc7$qVS+3>M zeSDw><%wksLCm}r_`#^`qzTPzrANlFj*@cKw}xCTC`4o%XGVo730zI02Si@(xd|O< zLlA@aN~I7)rZW_H$s?~Z;$Simyh%VyYCugfoJt%xt>V}XkU>6?(5kGAJ*gSfAbGf9 zBA^*kw^JbY3$h2M3YN2TAe9ZmS-;`qRdNc+K%QJFL^W zF^6WW)ndEU>eSPcJ*}7MP8O2oje!YJn4K7HV5mq(l`sSZT!} 
zV-+kg(+==_b|fY2OsUtf4Uogd$|XP9lANL?edA(AWqjTf2O#0SI2puEbCR7hP-7ss zr{iFn6dfDmR9ut{qE{FX@-bRs+NR3j)hRhi#gKy#J+T1td`e@ti}pl1E>E&H@6<;M zIkiY@Kmc<@rdiL^WS*eQAz7!!O}^8mGd+z15^aHZHi}IUWNKu9|35CdND6RwBydRl zOauIgk}c~Zk5UgA*>*@$ONVvLlk zy|~Kbe#WN8frMz)sP2X~QkIf|2wtwOrxGPhKs9|V8D2IyDS2nosHc&r!8#?gN#P^C zuNBKMC{~SgQ!E1*Q1B`Q&88^>^+;ChlI}32^6IcNPQ<9+0JsfyH8jJ-0@5%V?F6Z6 zb{$LzjLw!UH9zgxSf}h46_rcOk%{YgM^*Y!y<4c-g8~h5FNW0;h{oFkZ)E2t#dyjU zfqN#HU3of4ybq8d2j09HBL;YVrdn~rblk?`c3?_D*pz$&&P*kTgU8rJ=-Y^E!x<^& zeYhtuMh%Rn)M-&E1SFpiz}q-zFW^G7z@q}@4J#82?t-Q20Qk`~>sG2MHyeIbYlw{4om|lhlulW3bqky37KVO?m(KqeC z$CEQN^Jnr@wlO*3-AgyWVBY<^EITa=pKHl$i{-($6wEex`JS_J=o~&QCWbT319{;nc z9r&l_{FjFR*~_2ip1Jg)y^i>``n85mhxVU;#=$ENIrfC+*PhG@Hl?78UOPbA>T5&Y zo&gV**7QfN+wyBq7M}yg6x{LIDIcFbvuZOJ{mF{Sqj%4|chQ3C$upJNe}DGAh5ohw zcg7nBHn%_f#!ZFIW{Q{o{J2ft-)spsbLw{UwaP4W&ia#Pt~>66clKZMdik_3_gHl7 z%t^CepLO%*i%y(bc*n^%y*cYM3%c(0H&T;h_nKKdYp}@^&%B+vcIJ{zpiLIFXSSQ! zYv#N|fAZNcX1JNfdoB6)j4_+*z5lnL{oKFqYu#^`bWhI|XZFIV%?|q8zOyEetepI6 z>-p&;X4Y)8c<&|k+S~)ToOAWOnZnH9w^*cYe(y8eEsBnKxp-jJQJ_eM4pT`i-a7LPdOZ?0uO$BQ7j^U?)pKYrx_ zXB~d(?W>g2*jZa+s}9XTSx8G-u^tS!Gr|!Fx{r02ieY?H%kE3?rKUlHn ziJLEaE_>HjH>mUIt^cv;!aF`(Hvh@}m;TCx$z7@2?|<^C-<`DA6;HptkiUKKt?h2Q z0{#9c)$d+~o%*^n?}yL7devXw?p<*D@ArQX-v5X>r#{$!_ZwHd^O`i=^!(c{&s=!+ zI!kgPwrhhJoEOwUcB|bzIE@bC+v6Df$KiJtL43V(gB;ie`)UcC*JUmn(_bi<@r0` z5WSwqk5NBf@Q34+mF7i$^KkX51#`f5ulU8<4c9C^@5k`}R2$iQgnRE>wC&ctMX%%{ zY}wsczq9*6|MBdKQ~1?KKUw(QDT}r~{n6!jZTsw_pYHap(+?lc+kW1>dCk@LnfKmz z=P~p4JoyU{u-$RPj^5qdy!0XQ$u)=WzI5B~?B)Kpa?3prI^nKY_U9k}#9rs$c2{*% z_(_$%ZkwGpZ1Lphm19@#cG>B(m!Glh6ZhPCAFlu68|uHWId^&M&V!Fz@a+Yrx$xDK z-nwkNt>3$BL!?9RUEF(d?ZVvZ6P{G5Ip@5&pXcnb+10zK&m-&uuV=5@`=y;eghPK|BClp+8+jUjP_u@OH2=yx^$!j=y~F6{dFi`&Y|hq?dP7XD~4vdM~F_Gcbw zbAz>%^hUYtDl|AH8@DRl9G)6U*-~W9@>Y)n9(`F_e1mh|8CrIs35>;&XoW^v>tr z7{3T!G1zOT!)pI7eEGuh^5$BpN`ap%(i#4pLY+kN&I zyYMS+UVCRT{POZ+FS_#QTRcIZx68Md&HL52KR9l|s>AMIDF5*5YV-XiOYYw6{LSg} zieG=S{p{28i#8P&|NiU?3SU=HGxGsxnK>kT*1wMXzW~*u#XD|)@4S^C?|;>f|9w{E 
z{F%9DF5YrU;k^C74!yQt3GmEj#+}@?UmwEUnc4fVoPF7jSI+&qcy4;&tR48%l_Tnh zeBIW~Ub1BOdp}w*^YEec+^uJ?TD_-r#OXDuzU9Vpht6KL;x)4T)pTZ7?hfs{@OHm? z+0xAE2krRA{8^RfzrT9dJ)ht5o!mdJ`ZBv!_sf*H$=R#B$1b{fx3%`c=XUl7i@g2Z zaP}(xIFd|xMa_IE0$%{tXW?)*)Y^*p_>`oo|;u`Si8F zp0ok#^7~%@nRevE&pP{lFxxwEvTf{l9=PZ5!_Gr)R# z(!CFRr}@D7l?TrGkGrb3-}BO@PZu70{o}Wu{6>24+;-pZU68x+`nPji{EYppw)vac z*Nkg#yZ-4Tul?cA?DO~7=SN304*$3BeE4ktIapt}<>2L$wqL%}oy+$5DM~$jZvD=O z{fo{zv;2gUcE0MY*Ejs`g-zf4#^;ZI^2+Z{t6o`1ZFbq{JHL=`_BUB@ z+4#mw_E=N>tcvXP+M0d+3)zjZ{ETy#%)NU3b+_(z*e%PqyJ77a4`m)c_PwV@KR9~i z^Lws4e)%sRJ#s_q)yp3H!Mc;)`?rtZLfpMhJ!ot2mzVK6i z1@dpHAhi?k5FZ|I-f;Wq)5^D$Ta``E7v8vi$E(-5U%d0)%afz&qhGvp^{$T}wfEY` z4xCqwKl#l<=jC8~{m-W^JFzmj_i69d^&5^Ewm-Su{`v8@G-TzP(u&t+J$v+(P;#nH zUzs={S3kJ9b=QGMuRdX`tJnSRn#C{u{-rr97&#g;q5cKRQCAhE(dPDVAb3M zE5uKaK5dJ)*WGmH5BL4k@6NsOwCoLM9Z4%pO-#GV!2l`vwaj1LVI$hgqyQlCAw2fZS^#{y9;~{m`eqW}6 z1mCpT(`zm&Lf(HMZ~XYp!=LRvyU7s`oHDEZX;i)N3Hq;ZL|eY{;Ey+Z@RGwv4?g#u zQ?n-?xN?=d%e^1%JL|OI>KEGAsy1{zyXPHy9kb7z?_auf*Vpb`a?>$CT`J$AC^vcQK z1fyb?yfXgH-ej~d@|Ap_r+*~|2y4Jep^fjk`xXX|EeXczG*MEEDrb(*5#fwMX zy7v+HpS>sc?5mGZ>vno<|CJ~2%CB-yeSiIa@4a~Eo==YP!OLgw`0TOE?|;TzvP%Bj zQLnss?U(Y_`y8|9X(#+)>6<@23HvSb=qVR$vC|!UiLWi2d-cuN4&xda#J7M-erWcY zuY5H3w5_iG&1o;^o;vQTC$DNA|I344IR4c`;0^oi^*m8Ip4_m<+L;|+sUNg%%L6N! 
zvlsuZ1jP@3@=k8;2kT$Sy?z>g-NWy{{zvBruY9MlW_)t$gj3$Qd;a#XocGyFtE{Da z95od0TetSaQ|~ZkAgbOU_a{c{=z3(we#&gkF$(xOYTJ8vHR`Mf3D zs*7LEy>m-(-9^Vg(%kvUUp#Zm%e#Gc@?Y=+zH1)*?a4?|u|_4nJ<^ zJ$F4vc=Dq^pYr^%rEh(_>gts@2EX|mp1Sv=Ww*{f@WV46+Vim$TYuvH&}=JWV*f8X}6^M1Z$vz`8N1%Jy1*nO8Dy!gS7o^&c(cCB9Nr#|}YGg}|8K6~~Df4=*J zPxqZ1aQuG9eetKQc0S|}F#PRZzH#rP%I#(PJD-03@7w+|+OK*(IR7sME7$z_v72AL z@v+t+$KEmj_SuQ%C+upgm%?W25z1PXtZ-bn1+0K`I`T2$u8}{#;^!}a8Px`~dyZ`RIgV?JV z?Y-q^^}~;R{=)|k-fGc?g$J(s{5I_$(8(*0x$wkw2UYj&fn)THbe_M(EFdd`1GY!b zc>5-2!}>pVSDmv&T7UjB?qcNNtGE8kFCP1&`BHng@w*GZ`|odq*UEn-?y&h4hkttV zq0?n^UGbk04GzWn9ft?uoN-nr}J zwLks)#}7Pu^I0q2eDDciPI`~M^2JfVA#Pn5I4ArNKXJS3pyzHs<&XD&a_$S!Gk2}+ zvA7H|A*WA$Cwu_c9(p*Vc#X+qwDkDc<2YpukU*jJ?F^_*JM^6t{t`2 znajVt>y$T-Sn$4>bBKKnlR$=26(w_O_@^Z5#3D_f7ey8O@ApQ|;GU3~vX zKRaaLUUJQ<>GkW*`RQFt_@&o<_l~6xJvZmj4eWu7pW76u@2$EgsY~43&b@WhQ*Qp= zFJA}HVUfB0+}-be{kXkfe9oUdebg_%Jb?Y+Cu>$a+x+mr@^{&D_xV5Uy=7Zm-O@GM z5E@Si!Gk3b+!_eMEjV;=m(WOXch>;HgF|p9xVyUrmmtC2-Q_HD?|q)VpZ7Yy;JjbB zpy}1Urqrw&qej(QJ|FCY=)kC|`sxpRjWL%CobhQpl*B0Fca*Mw@1JfdEBv&kGSgzg z1TPHuD{QuO6)^SmjLY;`??&bhF!<2C=ZgMg`4$W6`VH-0kp(7N6mJp*9h7LvU(9M~zGZy+Ed-zKhjR~|CDc@Owo z>Ui#kxlOjh2b7(fecu@=K9%NUo|o3JINYn-1$Rtn`nVzd&Znn<#{S;{4Q87@hT&5@ z<2~{g*l2yu7z!7f%VL)v4WzH`F zcSEhhXRdRzUPFl45>qsXYuOwDh^TEYc@8u(`0vJL`285TFEd4^-)>Ep(e%gB)BE&= zyKkg=J}AkA6LY)0xSHVDPwGY6?D6Hh%2WfUVlUib{R)Z+eW0Wt#(N4ZkiQnfGFq4G zUMFizbrlfsDZcI2V9WFQrF|d-xCJZS+gC&!Hm{26F}jY;gO6tmO1v&1o%G=GuWC}v zZYMldNZT7l`~q-Oh0-Et7Sq7eRPW6OFjen&wFllG>(1lJmb&RF(%-)|QNBVx9`{H) zfu>T{4Sc|6WOSyniIekYeLYEy*S36g%o=o9LRI~yZGb7^Ko5Pk?d70LOTr8igy%Y& zl&RqEk2z>_;lqKWEtIn&t0OlwKhtDJ*PxIn|+=IXE&XS>~xIPs!Xh-pqV=7RGaP0x?DL-39r)sIspPZ;s}w8SdT zUg}lGfw?w=7w^ADJNIWeWXs31<*RhwLN!WJ6*h{He;lom#b=lY*?9jA%3eJgnQ*PxZD$?p1--Y{ zymx1(Y)6{8`VtGm-H!6QcV1ztjRsKzo5wFZiMmYlT#bIMtm*yQi*UNmcIs8h;kzK> z*kP&UINkVM5aLI>IyJ+D$#ifEHwXO$1cd%Lo5IaP6|+W~r`Bz2`)`w*kty9%u)Ir> zd0zqHnq%K*QmejuV}h}q)|y4_?!D~P0tp_5=%G#F86|JIPY~Y-)?4Y7y>PUg(Q#be 
z{>B(Mns}dx3c5Bs{Q-D=>9`x_HoNW;QIPUrzc`zkQ<3fZM?jmFs@q2;Utr}4VGAVOKnnLqP4b3F-F z+hI(8CIS|gFUQeKrIh5ePOJ#II#&{mpX;`@WVbHuDr7j}v9vug>M{c);?n@MaAAm7 zw!D*1Zn3*qxum{%aK)df0x!7nrk9_czy1IYR<@LnrG99#i-4!4zf0H4oqXGczL>(R zpgu@EhgZ_+DVJ>Bby`$w_)AImtmDx0Mh(1C>v-5ad$~bt+_uK^bB|A-%z)u@Feye+ zc!bHx7axLeNkVcEFqvV6&vv#GQ=0qY=#IFbaYy}X9kw&3kx$=?A$7=PiMl95FY#it zstB(=Hq`Ftns;r~CSwm^#&S)X)_{A|bQeN(8xtUAXVyJ&yK_HGQLibu%D0 zfo?103M^f7b;V?jXD%q_sBDao>pEV{jJgY}13O_TJhv^>skgxlvJyu^?(DE6GT;1m=9 zjenMcQ)RoZ*y5%O%$Y}@HVN(DIr{)WJ0g=1Bm&0IO$4bQ@^Q`1jWnDAJNbi}L#5ZF zd^=#b0@oHIE8d39?O?X|+x6u-AMID|l10UjTZvePWv=IT?+;m2wb)(lEUWNB7{lx` z*L2!=w1cN1qb6soG1?GMw-)L;S+|&iMd^L5o6AKpCr0PbgM`y~lLIlbQ{vS~lq6*R zO8U}9NeobWjp~fa=il`Pr6 zv`P3$XRhLNeQTkDZ+1SZbE;&I8a;u>T}zJ zKl)ww+f)GUCoa15=(~M(rB**&dpeppp8Cn{Prsja54N^t=1vN>$x)=#byd_{(Nop~ zHW`hyyBR8%5&SVE&~;%)iDngWsL%7{ZXYBeUBV|*6il+5(}l7uSr-s6?dFC(?N4%? z6qg+d%@$`glrcOHGiCuK_mTaZRy^D9yqQL?q-fD@8C=#UT~XCK7oc8d=UFQYvnK5vt@2UH{6#KDIBhtA)S54Jv;ikL^WKd&MAN088u30 zEo>#d*zxg!d&~DNg5>^q(*^*t&IiPlJ(ubrJ8icv6!AvK>DrH@3}}-GS;FVa->^1m z+^QUMY&`p-8FA`sjJ@U@ALI=+h!@^#@5|Xx#XE5@`^DN~H#iSWXR2)q)_T|^s(&$D zNzl|ZCY`Us`7u@eavKNO?$Rz}+0tSBhj-bU=J*tcBi+vZY!J%Gny1MLP#*~pSl6tS zw8fEyoqR9G=lsIw@;ucCiGQzy|M4M{@`@DCUHcwD6d{p4tQ%XjeM>ax@-KnonQ>?c zn7b53v7(CS-1W89nUGTebSNJ^fSvRGFH-iDA=cykm6^Yi#84;<9Pv5w$Oa`6r z>f!wNWJ*s?`UD>28P+Df4dw7+V5ZL1VdLwYE0_Flw>1JMc4)PHjlQg;poxav9OTl8 zA75zDhB(zyIr{WrF}BZONj_>}1tmF~wbxmC%HASyCQNUSr;v~2byGR@SSg(+MINg7 zqZ<0-ZtDED%hmIGzNtquAF=Wgv*;5|6h)X_aoV9iIe%LxhlTyr3D4i~MfTMKwRc}>QpE8#P`0mLC{r% zj8b`Q>BBhx1?}$2s&cDt=W#C#_9c#;~^-gC^qHGj5HLb!W zN~%Mn>2SL+QY9{F4!qUg8-q?Z1~cxCw`rLxT?x%@PE13R%f$B$XPPh5Y;u=A<1AFT z{rMmTjEp=!X_(g<9d2;bN`Cc#2Se0sMW@Jq_N@KaIQL8L>Z(|7fXBjwen7`#X{&@Q z567i+mBfEtC+=FEO9GH|7ue!9e-mp69NvHj;F3&uQI-$J!XR_!k|webla3^!P@0N-9}{{ZfIC!ea{+M*H~$hD@|Vf?hQ8@Wh+4R{ zy}grO=a*@FZ#4+DdPa|RzpK|0BNh0gr~yORhWYd?2`vRWyd^{Bc<0y*781ynNhUfk zh*9zhj3pBF(Y?tNr308Ly4Tb6hh;Dp+put^&6L8qa*gE6HpQ&ghW?$61D`q#;!Q07NP8`P(GF3SNZ0zAj*!%Xq(gg2P`0ys*A8w{`M<*hW*9 
zA004aPKTK+8b+4~L5>H*9Z?l!TbMMfm?L2)XGy_Z2;Skmzd##V@wm0OU<0Of?&Cz&}Uct-by0pJ7xl^?U;1 zmlCp^OP!Cuio3?+4_(~P&x`Po+lZq7!B_ z�=I+7c}r^V@e$TE=rFUK9)+AFt0xQC9VfnHaiYL=`-HNfTd76}((lbyZm&@#AWQ zk22orAmT;&vS!ZPKkDi373$$Q=fUQ#N*W`vVswjLdli~~;r#9oug03Rna46~11tnK zH>ef%+|M2B6+diqyp)1c z^t(9KX4D=!Ki6wYpgyOCiZkTKuRiTsW9j3f#20o}Z?+f`*3)6;{4f?eK1Y@u-h-UP zu_F$VC|*B#nu9C5XtZZhx-FJfTqOSE`Xh?;zJ|!13OQpD&)ueqf)t)>5e|-nh(!M z+Tu$$=@dv^Ul1wX#^cuw2idy0abA8^{-91_2gC$t&Lr3rJZve&uF6zcq_aH;Si?>_ zGYLDD2Y&A?o5T2sf8=@gxGrCdr-$0tJrjY{p^=s_WX}hS1tU`J^7>1|4|l9QaQlDs zY|$iN*X05-Hf+qf;O{;csh*EYjJfDza?OHBcQ$5GF8d`IsjZdQ1*HzB((BTaa0Uis zq#;T6iJv+3@TqTz71|Mwl{!DDE77Fb=u6{mUuFd^$|CvMjQX0VY}Ws&zt81Tjn};e z!p;d^myj~aCtSl2?^0zNa-%dLOcU<}+aCbv>snE?oN?|rZ&!-@2u%PIt?yy*Df5m4 z7N!(?LiCR#!FqBeIE!)$c=;0KRIKEb!yPK7hRY44C2!Osh2>Iyt?hH-6&9s>H0L0U z9~wI31>$reQ&oorDtlisJ$RPGL>T7m4^Y6`4LNLb)lPzRLB!hxK#oGLevKI<)~|0V zQn)iNFEr5m1bg+(^bf0h6Ioe_z*tn$Rq7YQDM<+ z9dsb#btVB1u)CNkJpcqbv-1JaYdE7e8|GrS4tqCMCpE3)df>!S0`PlI=A{7G3qc^y?{i2WWrVuSLh(*aIeV6sWjDC-S#2DNYr79|21i*{(> z1ZxJ+Tve7w%2@j)OC9&oaSoEtMd&&YYaTjx6M{3oH1+4n=0p6U@tH5G{?ysz|9E#v zz>L#0$EG}$u%U&R>6{#7q@0_9>Muwqau>IfZe9|Rr}Zh$jAy;N#D&CDWcNAHT`H&PCr3lCX^}I5FCdD5%Z> z$5z00c->H^%>NL7N&RLq9FPe^)gMxSSvDk3A@j;RFgBa^OOWW-vBL*rUvY0JORkik z%1TStYY4l|L_Jc1H<}fVz7^F`eXe&D%8tMg|nx0yBTptdE9CAV3fNG5JO zY-Z)^)N35w4e%a%^3d-zb$oUSe7{YXd7~Ny2pb&q{?ot<>0Bf+Wuj z&qJ{FqzR#r$1V4$d60zzfbMgobuqFlT^}(UQ;u-5pg(>ym3(kp$BNHp5BHRMAbnA+&@PKq55x|l(4-3V zxl1C&s27yQJ1oE9i#RPe>HW&?xznVxwtvUJF-klZ&|*1yAnmP}4hi zQYcrNVfPIHG<%gRxy~@bcrCXQ#8_!qy9T?FftWr&fplzyZ29o}Jmo2tHtgP{U&xg8 zlH-%d05|ZIdHMz*@Sc7fO%O7$#E?;jxK$i2D^AhTkNM=S9(B5?2mDTX2vILaIvjQ1 ztJH^%eQ}>VcxE!V+Pt8HwV46XTK)AyWrJ}59=xO=tW)_LBeEdqYC`Db{zwV#DhhK3h#U_V=~W^?9Pk7f}O$<;b_DEjO2 z!y^=Xd}a~cKUeWaN`UmQesVUiy{EyZPY zeL-Tk9eJr%f=MgPbIDgHWq_dY#`dC%BHZ=$Vy`?pK-9}lQA_X9d`}RCCX^^2SHV{J zBnmDdb&0)5EQVrjidAOJ=cq{|tegOAaF~1P1^)50^<9SZtLbeUT~7L_CY?Gu!Rbn@ z{;P{(;j&H6HOOdZ3%)(yFIQdqdE+c{DkZloP+Iuxi0sC*CrtqS#piAfP)vO5N69u}w^CR{sLfSG94>->k|^7>}cg)zAwMy<(HA 
zgUX@rjw1dEfGQTdYPx@M0dkzuNlkfoceg-%KftdY{QEW&JU*fGN3oNOgskzhm3f`E z=P`x9IaR}D)GxfnI5MoZCdy(on38(Z3 zlD)aN4F&2?YG)+&L8V(mY7XOd@=XDoJ~aIyY^eR*S zfN4d45SZq)?C2R+^7k<5tHN zV7z>o#6EX(hfD>(F6;vB?D{G?+5sCR29K9aTjmQqyPf)_jAg5CL*NwJVS{GiZ@C311kJ~{u{ah*lr02+o+56I31_$Ng{}R=^~g8Sbe^|2Kfm<_Ww z`h@?wDd3NnAy1mK|2vfbf3C_KuC~ra?PpfcTJK?QZ6NAXR?ufSE1Xt-x#&#^kTqyN zc!R4)?1*G)*b-1z9o5StBqBS_7MszAiOqDvx7yg!J8z8=UudqG*DZjokXlJEuvrUj z0x_0yQWxXThaSQ58fT|;1FQd-Nh=}z0Rk3383WuI_~o?I5edj1qj^UiKjMYuRpC-T5aFKPS$`2QLPYI;a*tjoeER6ol`2 zx$%b&p>)P+ z;kZL@wuyc{E86n>SE-A8?p-4dUQek3s2QmgwH2!8*2@@WVVT&YcPU3*1pba9z%!7t zb0eq}!k04KHGCJ__$)04?+dj~dDDy7AdouY;`SGiDBMy!wZuQ?1^v4vX$Tv9 zD+&S9XX(2I@b~GfOO4OPc_zDkFF>SMM=$Pr8Qvp|3WA2=sh_`j>-8Fh3o88xIcG<} zrxFTiqPJuO-GbZ@VOhb3vJmD`NnuPi^w#$vs?6RJYEaOS1IYi`76L2{+&(m*QhENN zx`~vS@}#mff*HWuG;>88g0YE>^vMSEh^<@HDG|u6Uoy1^5sxz78r1c+I!G?BnFsCg z4p!1xo*H`U-fF%`^QpnG&JVYQdyiB~-1@^P#aQG~N{^np)S~wHYCO@ki+V9$Nc{%9 z2o1;?Wc4;smzP#&|55kmAI)=B zuel-;!^{C=f!X}nuy&&_W#3f%5{cnIG+ay*362s{xBdO$F-Y_(!C0l4rNHsN4`IYvCrW)}K{y`D<=5uE5L>6|u1y$kJXv1FoM-{DmbK~u;r zU6GR*h{v4XWkkOBVc_^p{(~6abUNak5U8FIS7oE``vcTN3X}QU#1M4COe6^+@NO#)J*T} z`?vl-VS21{r~?C+oCar$Sfb59Bb!Ep78q%+NUF0{qt1#h{M?RIH@pN=W=W}bS>xR; z(dt}pOPS|1E5h4b;=}W)SHEtrc_nq-w71>cM5M#Q|0RZR{RkAHVQDpU0Cgv5K@Uqu z>h(LLE*&V`{JM7G8RGf+g*Ucitr)%XdP_NM0>RkQLmZ>J83h=AFR5CwMekI%194Pb zAKz12CN;jdchLp}B5k@0m5^f(KDq#%=QfoPYoCt57g&es$oi7cmn=cpZ=u*UNz}~N z_Q_M=`Y$8N7GHI`((;zGloy4j1!4ofEcK+UJ3!q$>kp0MFi?f-e;nN>1Q-ZfgfTkb z!5U~)a5L%sLc+NLbn1K6w)xIG>YJ>zI;8B6+{|^7ZlkOq?ne z_FQGLyy8w-G<|PKF!qo|)ekJZ-_#S-cC|)rhZu8ccwE%q)h_4J`b99pwy~+~<}#In z?j4%mZp|ESGfEV&-ljKXwNRW%!kn*p>NCdNf3*7_3Rw%W+cBQVU0FwjFj~=sI<78m zzi?lGb(B{;FV`A@@3a2uI?o5!dYl+L$pX4cPS&N5EoIskreiyGl)g#axN>BKmWJBW zXL69!p(ozFtWafg5UtWDw#RcBJCHBMB9ApiI%n~>*P(mB0`yp9xlZR?X$q?EK}lO0V;?6E{0f7Tg{MI%jCDdOw7sqb^xDe+82tL{}v zl!Nh#Bvp}(OA8tm_Mq?GmG|b_fEwh>w+zk7Elz-1u2D#8KX(){BVSr=cI1nVt? 
zsNxVYaCIjj*xEKPfR@>)}$zp%IOz z#$EWZzzzgH$H{secm%3(*nnC{Z-nrIReJ9a#YUeWby(nasObzJwR-Kh$tl6;-LiN+ zx-O(SS_TcM86GyT90S?5C5i~D$aB1k+Ps)4)vHWc27Y=~a69zy`AQYGLu1OlCaW%g zPSvKN5WW1!(Oj#w7Y~Q1Z6TvsBVMY^A^39%mfCT{Qqt9PyK3EXA|j?r(E3%c!|(;u z|3m?j5bO02EDR4K4(2gj%7@F{Zm~eR?Icwzmd$wTcwo2hbfKhu7TjP1vN?JX#A4tX z0PW#FPW2V}<4BF0Rt8$O!-+(mYw^^gO3-0HqqI5^OJAjq=RjG1rw|vQ8;&KiZ_|l_ z0giDP9lzc|Nqnfmh2F9A8)5r#9cT8VF!Bd+ZHMzxIV2_R!f=vWt#yAo0~Lo!Zgq5V zrzDgcxmv7O7L|8?CSyK<8BwuV((Tkf)E0KG*;kVvGxN(F{h}Z@Hx++&97GGKshC`B z_IshMQ=XZn%fiDm_T6$2Yj5xU@OqhK=f$uVy zdpd%OKZf~8@F(soXvA+#DG!0#Can+f)`Y`HmyLMW8(2Un1O4?@nhcxfPiKlB!O8ks zxH7WieRweb-7n73*S~H#ag9`T|4ak?Au5yOed7KvHAq?L=*Q6oD;%_#X0f);z7~hm z>?_#)$s$1S#L7wERQUx4=7+=*&0n& zq$%m1&iWu)D})w`O5E)w0T;KW78;Y^Lo^2*9^C5gzZYl*or!xa87ny{&G%Opueiaa zi^;*k*b#fT^n^V~)$Bco{s|WyO~v$js2wNWe0>icE~it!hM^BOMSmf<8S>Md_u5** zs}$GjcH@|Q7V{QlZtp|>b8v;ZCKl8DCrS21Po%9AgR*2bpY#t^?W-OP>dT;!FHiMw zxKN^_a}$Ch_&YC%r$;OP?>1}Fr~+Q-&9kl;i-kk#_{!av3@VzOITSwd0S9Dv?2oC3 z9ha-Z2~qUYUL4FzYa!D+;u63~B}oc?OND&*ZR}Gpmb3ZtNT^te@(xOF^}I2=5V7AN z5SE?(i3x7u)QGS33{s{m4oP{wJhf|r5w+1fos(ziI{wi&ib?vnnpmaXnEXd?X*xtO zR#Aldhti>-(r=>%GPm)lI-OaWy(|Q)i+UgEf=dj0`($T?+ znj8MD1(?JA!w-})R_xQ}U&!E5HB%KK*$s@l56D3x@NTK19N<<`$S)+Mi8x8VW8F3s z1ghHsI%FOzT<8$6G2Uywfjut$?f_%eXWGQ1*h>}wHGW0dNl6ut37F7%*NtLN1x+qA zN_C#vT>seT(*8me)@Fv>6S#ziZCOQpYR}O!aiQ37*p1LV4b8MPwwOMZHeo5U-Besc z$|>HWV>3_9yrsuVG?Jo9&85|LzlP2W>Eoign$PrEej$tOL73HSVI^)P4ElGtV`JjM ztS#YoNK?009MV=H8CB6}L7G>v8KbI=k1y~ZYxCM4k3TnIgcbX8m7)Kr)2o|VG@@~Y zR_0rqjl!{Tc#Np$47{c_ms|MTqTuB!!n{b|_*+Lc0qLzR*P{Dz8rm~SQWaa{x>3xf ztMuipC~qtdx%4VuEPPa2k2hE8RzPx6)WITtGb{Cg&u)5k=PmfVw2OStJyoT_ab{mQ z9vGKDPqWMs8|u>Dje#eEsX~1%w;B+~ca_A$N-EuaC78h$L`uw(23rx;{yZQ-p*9lP zwaZzy(HS}Q#-cGhFQeeQ^lXiG1HSbuX>HHgsLRoFpQukNK7j{R2mM{2hAT_0%B50W zgZOwyr-~uW3#E68AfWOS{behTr?e{dRMq;JwU4y-kPDe!wcG)5+8YC{v5yjh!#4(r zOYaiT1x?B4u>BB3o`bpl;HXXpdq#QRbG2oOlo#A>qxQRu?|TpH655=Tgn3gARu|;W z@^^i&rnDW)0TPWqfl3lmM0m_b4CAgZt{v;wt;c#gJq~CUYfF6k>K^5UZ5ELe%EOge 
zFPKMHdYfC793Q%^zut!aHW);vRR1%7eu(4$C3Jr4qzk1!rLw+U{`M!~?-Nx=pTHy< z{NDjr#7ilZHq}B97h=H8rc)XKC$2k1o@cKzIA?q{{e7GE(QS?hc^kN^1n zdW+iG>~ub=vyu8+onaXKw_w&g#xeD7Fn2ZYg#t^uk_wXhQhbMd4z`SD>v9!67Qu-| z?|pNl7!~85oF*p~D&^*VBiC=tIrb(9eyh(SGfTwZ;=`BlxUYa!=FnMN8hG3m_HM^O zXP>Z9vi4=3SzGfa#V#%$74g;$h;k;@^v3$6&hJEu=UEEdJsbqnRx5s0KCL&HN~OID9CIMESCF(AZsQ;2cldGFX$&ee=0f`=z_<;Eyo`x*n#uwp-*6u5AVxTP2nxyrI)Be;=+pLY!& zq$-&2fTJ!WTz5+AaSDywg(x%LfOG2yu4I5~!z*Z^p8Beq<6W|Z28b62x+gYtFdVk; zH$~1DrA*tD+5^<@r=Y6;uHdz4+;Hms&<|!tRiV6)h@Biwo~!dTLBnE*He1A2$9}J= zZ_N`R!GwO~c%g)f!-!h`C@Z}E{hY{`W|lgn5~iE7SF#s?ow!F%ba_qhp)j*p7dMsI zdH9}}%jn)fcX9dBf8|r;Kd@O8}$!92lO9p)Zso)EFT^+L>1R;J(L%Wc*! z(e8NN0>||_epcq4bCjXteWJNPv^Uf~N2u2|N~Uz7!_kms&d|s>SLwHs?(J!3cKj+; z3ua~K`28oUDb5~!j~eQ;8Io;+Hp?K}6-)J{3Pw&oScrq#>A>)SRF`U^A} z_=8e2vzoIHX?}A-zh~J)i?o`A+z3Y|?)3`J3tepKmxsMkt5Tf?#3erq5Y4 z$BcBi&|pYjKm9pq;odw0soU`(jNk}ZgYH2*dfy62{EgCgXT?ELp^HtKn=@KPaHaEQ zppoqF*FSKiQ;>5~K$7si>uP0d|3}Gw2KmcgpTJuO2ks`2NQ6UQCfK;??Zx z?~IIq>Puw(HtatnIR+pjB?;su4N%|nwrEf*83N{6tRn8&trWpmpL6Mzbd{fFB z@6HVq2Q61t)~E4%z>E3Q?*BHc==)~KZQ<;*9_nj0%1V}y&kukw>~I zcB8Hhq}lS`^^v{#dtucTmp~w_JUvnv(uJ{@nEVsr z$drU>lDIm80>HrYXa3KDBf=3-nH!-3L;qRwCXWed10gFBDI)OdJ6ylNN!gJvnNOep z&6panDUs*e3$qsJzZ$P*W-XX~HP=!8J1+V{Jjd{ps%D^B#6MSBaSGU*P9t0%-iN-` zGt@b~Q8M~`_$Y{->iL$je0?yqS_|=wRyVy|^F%&{t@YBbR=Aczk%dCma6H=^@2gDN z<`=Sz{cF=MdLEn7sjagy!B!U&%x@ioS~%H{cr2FJ&b8~VrJulCn|*B=roGb9JRU^sU^dR&%{VkGCf zeC^KzMs*#0oYwXb9q^AiUV3l`N*z&4DBb?7y87O?!wPExFi2o$BMZOlb)GLcoibq4 zsl8Y)m=N5fnw)XfiJ&j6>1j4qzN`RBY<2o;-~O(&pjc^10%Z>9D^c*s2bv6>L9(NV5ibw1uFOjVa89$kk z{{FN}!|A5^+CU`2`46|CXyY-R{`j`um`!)P1KvR-)HX-c(#g510u{xkDx*1KiKyK6 ztWnFN@!~xAB8SGI`cG}dM*dga1jL!r@@P89?F8m6ZtFc_(`?-qG zuSXE%))}SUPOheEX#tK>x;I)V~QK0~H+piRuBeNKLe@}%k}=Gnb7iA>|LmOK8o7H!O6u%hu;<|CljQQV)Y z0|WJ3jKw95;@ zrkvEDDx<-lW^C5w>}}4DJoXmKP_Fr@{h?~Md>8cr7;b*F*+6y@o5}|0!(f3)BGcMb zR8t=lygJyY_uG99DT&t|{!8$hta9)&*2WEh%bmUa?tdMgi&zr4{37DGrAu)=aL$ss zBZ|xsCstnkpuIZ~h2qvJEP@bq`RDgnE6lbvUISJ~7v5tM?m`R)Gqvz_!shNxIy{{L 
zyaHyE{*N`z!#`MF7sL&9oP^%Kq*JD6{epgD`m|3#N2X22XbPj@e1hHdGt6K1WEBfP zV=&B}Bj40RPT|{>EwRL&x2E5_GMl$;?ywpEOzT=1soVWVcqqbJLNYNuD`yGx;HaNo z|23%h3XYNES1Ko56L-zWVQ)RzCW{L3kVogMmpGDZqB&-DtzFp^r#XSJlO`M&HaK<# z%ldC;jqz;G(dD!+h}b?cp!-Xem$qCtHA@q5humHs{4@tIYS(*_)s5%x!_l#mBFF8V z%zOcqAfG+~Bmb1fJKVh+$gZ`EJNTn;{v<=n@v+{QCt;!ex3Sds5?I6*BSzgK!(Ey% z{XB=p-IXt$isa@l&(E5c$L3(==d^T!gS@8LvK`&SZ9VG+ewu%T&p!ggUHr`9yK>D! zUBGlXb+$c^iZ_%j&!}f-j1s3W*5phU{{XZ_$ohmWro-=kO~RPO>nC)3`Np-lu|H@n z9(Oo7=wLrlcX*U^G6f%~Z>m%-p_@irAefWAdr4+ntr^=W0O9Fue=yK+ zzjxWE{V~l@QjKr7&kbKiSdEG}5gR=^eFOLb2{z%!mw(kH*Qh`rlGc=ctJ7{vT8OY+ zAnBtqme|*rnYoC-l4sFq%^pQH*A@-9+Thvtk=!(vs#bj^$}tNR*Lqme@5Sx3@uR4EYhXKseLQ8GtW&4b^|CzR+Z z0g8WtEMC{gCED4A%{vpk;!8v`l4Va7u52KQ*B>OGswlV`OZ@DG7cuz6-m6>^;;e32 zsb~$h=x=1B-Ic-{&QM)AqkWCffVUj!Q?$Tge0zFf{-p`_J}-1v8MafK-f#XPDU$en zBM_zw0Wp3UiDBb%RL&d>R~nzD^6VZcX?%@4i5GINT%oDc=}Oa#de$l+K@iDPQh2hc zSxbOx-yK(3y3h!a#Wx8K9eQk3pAXE#7Pz806+R(3Cj(uf=Ef zUMC_DX9Y`E%COe71>$IJ$DPV;rOK*LyDGZ-)s`RglrubIU|&j_DIzq+_1NaA_W_S9)vUL?@66Bu^b8Z+de zRW85_ykg?l?k2y(BXW=(zA#}LjI?)EwEAW>b4n0$4)h%0G@B_ERfA9UxFN7Au_!`h z7%f`teYqni*AnD+zEK7{sZM47NJ75OJR&sYcq-DU+Oizg(cQmh>JEe`N@udHO_F^&31)ppR{;f9iD81fCoh(3SXc*h~yfIo-uG%kQ?IaK%&9EuzO zcqh?aS6`FteA*RZx1B~dCvLTVt2~zTFd&)%x+%?hD8IMla&tUBc1FF#_aAV! zZ=Q-nrcA7_27U;*ri3OH?sL+Wlsqk+BE(rsfSrBJFPv_1`Rq6xCEe7%>E7eCp6lvt za(bckW>oOF5w>qB2sa(>hrZ!*d)*%nD@8_~w<<&4$${xK^|^J1-w)(J8NLt&Gupql z6hRt~;re+Du7|80LDoWJ@$+HJXqVF@r8inM=eXik%g$<93p|E6;wtK@mM2;j>ALPk zC25$@>8A!d^4KEVufU}S$H+BCD#d@oY$oT#?no~EHotHF8N~lx04_Ip7!IYSj8yV5 z9=N!VZ&^c|Z+Qga^y79@W%GYKv4^e^VyWt~$PlENr5yqblkrU+2#-I!L>{8gYLRFn zK+sRBOs7%(y_nVPlh*7b3i%XYkDcri*0s<7*j{DHzB-9;thJ(5WAr-J^Xs!;mG)j|-@Vj#M)N`8dX~4E^P=QDR#(_IIE8+=p|s0QngpYjzXS%8ext1q zVB5@1K-Wx#Y@g@-1qxjBcWGN=l1&m#GXifoD!xBZo>ThtHGhEVct#!mcs{vd>JAQt z^{2ETzMfzV{`@RA&9a8lNU=sU+Gl%|6p8#krNK%o%ae*TCv?|BsEQ}W4`q2p88T_P zAZ9+4$dXggpJpBx!C`YpR9&1*?bGL0H2Xo|_{=58YDRTi73V1?%~#s_<)*>z9x#z; ztPf``1ew+l0EMJ~6mRN!=|oylRk~nyTYP=hFX-L?;3I6G?Xwvv@wkeNa{A|ltaz@? 
zd_jK_a}r-cn~O&jKJbRMxe_GKYC^ToB%jLQlQxGLke$oz0c}6LwI4y;N*FtL^^6#iFlIw%6n5ZCHYs^Bf1t9o)G3fEt0%>!@ zH;hFoUm4oT_9bRkvI9M*1rgg^#9!Aezz|YUDSxtuC&r55NjBXFNZb%=hDFt}zLO8o z<|08Ke8%i(zKGy|IxQ#6Yhu}-Hwzq9gG6&f7O(1mT~JNF z>3XeXjc?<4+-OV|pQQc*95s<>|EF&nBRDux+_R$CWuC9E*u|FZxLBi(_eP7l2ajH($hMFHUySrC+Uu&&vtzKO*Tx*V?Q>Ya%fl-im z(rDYH0#Yft$s6+1oAF$NUx1{JDb60P>j9o{w=VOprk6lcZUfv_{ zVSmhm$}e`Ex<&J76n7(z{y<_MgZ1nr$zJdK9cnN5ay#6hJUTNE1$@iZi)cS?w=2Yu zM8?PoQa9j)Laxom8r$RjQ8;nTJ>dYz^!FV7j3D-$!lzvARSO*Ta(-@lZ9R8CU(luF zcd(Cz#M?IRy+zLvW9H4XdZCaarZn!;4cJ@njY_BEUlekcAjZz$4x~IN4;3@SQvJ#2 z?(fUlUKKwJd??Jp!>U>br?V$iTqx>w??HYqc%^cCo0Pi>9pB_5!-Y50$H z(yFvnAMg5cFT{k~)>#B))U0E${osM@cU>uuMFuv1G^BHb%}#e_e=S3M9FEPx2Kzc2 z?3B;39mTAY9+auoXiIcr7>3LYu~}^l0?bKZFI)Dc?rA+H|BnCl#}}1M{VV_Fsd_Ue z!M!=G`fnC3J#Pf8R|5H99Ij8gFb+a#i}Kdq%-}XR;WIV7yzp6pl5E{w*YUl1s zyK6BAV=yzsYoRF|jmCEMlxsnrGB|cx1&6vrg|0YH`15w=$w)W;(WH)3seZq!{VvS|X$y9CnWeP8YKH4>` zlGh%d#N9-`(5!NMp3|6n)IZLYr&e5g<+M^AD^AkIYq0R_Lw^$Tfst&^0y98xFfu&@ zs!^O3h&>tRntGn6g<>3a4<8Ysa?ui7-CKv-+O4x|$-vQCf+yfI{+ z&J!uo>TPokl?Nl)Jy>c5obblqGg&>G9fQDJDZ&xb+s>0y`=yTwaQyn%-XQ5Qy zBMgF~>EMMl^_8B_C2Y>SFW%c9`K7WbDAP#Ne{sebE%nrDX}_zS4PtsU)7HU{MSg)c z_%p6OLU>W{{7f2~#brITIq-%YPdFu_ah$4VQE#`jX{zQt4hLeFU?o?Od-L5$nn*|b zgQMka5D~8UN`HIem?X2T8EXQUz!}Lk>I2zx{Im4XFS8%8Cr)W^2c=aMBzOMF1(2;O zXxXRs8-$Ln3RSORC# z0B7;S_1tklqZXsqmhe5grr))_2;l!jANXaO{s1l!blEp`&;m?XMWTuiUW9welk%aE za1Fm9JJ;$@ex$QisDh$a#KHzqeQiocBH^zXpfCWGXHe zZSE}1V;Fs5ZDPdsCE-fG)mkWPUBoAIQ2`#I_CL!+Qs^Y7Nrv=ZO!N3=e$QutV7~KLe0bsjh`gO8&0Tt~RIvg5m7Z(<*ZPd*Myk%((sT7}a zAb3(%bh5$DN7`-IA49XC(J=sL>9*fP6q%k3NP z)PG&e0EWM%(kf8or_u^5d9uJ8KjrL!3@9_+$1Bt-dykr9XCy;u6Z56U<%IW}rK(?( z*6dQKR(F}@c#MSHaRWBwAL1sm?|{oH?-SNdi`~ZrHv|~K)%X_?PeSLy41Q9rK4q3f zrxklTQ{|DRZeXDIA^!w;t^Nc_A5z!P56j4%bge+hqu4Ne;Y%|>y^v(B@mPxK@H-_Z zBPzHn7K|b)_2VffkfG&R4QF)j%~o5V_R02`(_|_ z6g)yXeEW`>c%s}lWVGmNCiz*aaD`M4}1*5k$B22IWFh_JrQg=XGl{RY5xYVZZ zwr`SZ{T|r!{`S!`Mdq)snCwL3EbUBw7`#1^tzQ{Ro9#(@^<3FPFo>C0xgH<#_>s;2 
z0BT%;CLfN15jw8d8jwa^8wh@&9>3#{{BKELQo=R{VRMq8i6<)Y@^I1>tMHWRdrCtOVkW0>sgnW@Wz28de zmH#5PxzOIwx_>GF1a4Zydry)f-t)$9Yk-4Hx%&`w+Vpl2LagEh+j|^^@VdbkEHmPG&H;|LbztQGDl@8VysUi8Q=1B;$2lF`eY~%7qFs8WbWiL> z6(=Uw7Z(#JkBu!apDC9~ZbA_QNn(7v!H^l2)Xz^t!{{%9AekK;M~f4VH|cyoT%YQ3 z-|a9|>z2`YAUIl|o|{`|ba1M*b24wXcUpX)Znh|2%Q3{_T#OB=ciaa9b%DxX+EBDe z_qoXOaDIWR*SPylKOXd>q;2LNm}`I2Qc9ScB|rXi`t7_0GJfX!uikUv`hfz-9N99mx(Arxvg(^1`-Tr$ys4zhf|2dWe%z zxUJ!F@THBO=}lfMkq2{@wAS_G(_Q_wuRp8FWUJZ+61$VnKO8o3h(*zdU11SNx?-e| zteWlcLFR{k?E(VOL04)3Osq<}>AH z;BMjtQNZo>*>b8K`t)JN$FKMg{1&j?C9PhzT7 zTwJ!Q^SY^y6FsczP7@nP#z&5wpIw&fbgtC^<;V7Ko;o1oJ8=fD!q@J7YQTfogfZ-g z&^C9bNtxqIcvgI5*k_4ZTp6PIhPL*3k%O#EbZ%bM@|LD`Euwj^o(5)~b-eS|;k{s# zvdKlsxUEz^~5xt;&}7C-MV5N1Z#0*v;n-9t8J_!p$ZaF&4-l z9uy{x%jFg}+W6)N)R<{~hXX0|NyShGRtQ`^mRg^c;On7BxMWw-tI_6%vKuevuK#%$OOZo1$WR(nsoVQR4a*ce_$hWdh~EU8f6?6exRI&-IEBfbt$td#*@S<^ zkMQqLfj%NmClNWkA95CTsG7x;^krb&Ew+kT)T9b~_vlyyQd<9tvAI_4Y3yVEUnyNw zg3jNInyF##)-gbnQE5r2({DIyD%87dst5MxPC`Ce`UM^ur_fd0j@2XCO%JrSi(i zQuTroITnXmIQvk~rj%A}Z_rHwY~j~;tLNQRrbY{bgujJeB-FN%L*pXo^UaF35Sst~ z>$7KER%U{pHK+B1#)`RDHjBZfR$X`T6X$3bY0>yQ-(ag|NnwDK^kQQ%wNxkezWIj* z@;VTK?Xw!O4nB8Vt9Cwrzjb`M&Sgf?ZMC`9lZ1Ov;)={~d)?H<{M~M81S^VhnvSA* z+6Z4(#dWJX$oJje0C7qEX(AO+d@5RjmyoYvw8AHfw0;`;s<`4Y6D)txo{7w2mM81b zB6;5I?)tKf-R8hTX#hxz-1_|V_6O4jRmg2!Rk!#-bEp(cJ-n9zK23bc6Ig??}HJlnH%y+XhOFZw5(fsXuCb6wWIH zK0*i=$S@6O6}(3z)K)~(a>Ej>HDg>o!PprPw`OHjdS;g+Epi8Ml-1o5UGwv22yNtL zg-+L6*|7eB*L~89D!f>Lesh(X)&R@oHFP0kD zpYrKFnXtN=56ZOS@(Av&SbUl7^~n-LvCtMl-Dx+%$j#}MZ;nZSL^o=V4HdHfROqNp zcenwY(a19@=Ry>P6uR_QX?&^W6)Dzk{Ya-)H>Sj;^`rK(-FE2>^k3lzjYwRJ>fI=1 z%fhbDyHR<-t5xx0d7SkJj`j!V{E~4EcxmKTA#Mj&rhPaTD=qp1=m>{r#3yVPcYO5D z+RIUD034;FW|5bScpc4S;%@ z1*ScoJe#vg+ve-nR3alU*7jMUVd+XKc$H~$n{3#&9UhXl*A=dz zG-;r=)=Xx%_pVBc+nutN+#dH8<4|F}A8ZNG`pOvxKdG6OEABGT~5(MPcg2JD&O_7ent- z%TX2*e(RQ-+dWR56N%?vb4iHw*zQvjDvHz1#Sg(Nn6$U_uZ!1 zQr&(XBDu73!gb+{R()&byNYn=)vx;ILG!)*b*t5*4UB;*X0k%R8>At&ezc^G!Bq3z z7iuT*9_l_!MTt{V`Z_Jk?Sb!`^pqfJ_cBxNbB^z25J&0E7KJu!n{PG{GZj2#j?a$O 
zFZb?EXhVki$y(Gvl$dMu-msn^lP#D@P1(jCk#BIUP?cpfvH)*&SE>mv_`?{UV!kax z`u_Jh$l8Fb2mVvegP9{a{?U6RO}VpYV+9_}$(6=dv|qWge;v(m)?6hs(ks=4t%~!@ zxo@htRUj(xUo*?OZ`im=Z8>h#=yq*A`XkuxKY~pv_1*5X#UIJDL661#7;tLKyEwz1j-a=Kg{1g31G4hRs)NCLSYg``B-l0rjQ3dZ!-(s06 zH{liWIkwa72!egTT92Ht8MH}IJ=b-ynD z{zkF>h+yvCnrYTIn6T;S=L>tU6*l$?_0KB~dCFM{)5M)LUOaXi9N0y+G{bEjNukl- ztV+~Ot4-Unx>0HO>+3dVc1H5`xlU&HoA29-wVIcYnfj(P$E)=1UclHf=u$x>+}0GU z=_hD}qKyI4lpAb%5>|H${Rz+A2v3bix0CdX*X|X#4Hw$i5*N!EzY%+?t*2=<=hiLa zj2O8Sl+Riz&-OP|GF|Y@T`x9^ftB*rNfkDjIfZO$$r5n|_}pLi#W6=#R&wmhO0^@= zNK!7UAq~Xnilx$*-(7eRF5G@7m>$NHmK0eus9x-eZVFXtrpX-Ux3ubRoP^!CsTd`V z(iaIpj>d*EO;XC5#C;hi$VyFge0yo>?^*1aVRu(2PKqx)Skhe$pOCqK!I)^MA3#YK zJ@AZ($(CF#2+znI8eEqSuOYh!EUu(`?K)>4Y}HgX44GDJ+PgI0Ai7^g_r{!L7AJUQ zUgdO*nywLL-3oRSh3Wy_DO?>_R*p#mpk>6-6F*nz(T*LP?$ulUYPVd19=jjCh9z-n z9~uX&;UOP-B{%U|0$eeO#Ru9-e7td6*QyrF^|k_ZlwFJ_55DeKi~LIJM&RBV;WeZH z5{#1t92$~tzN%ij&MDn{hPilp^C%rOVhAPQM)Kor22WrSr4;envwM(^&TZq4G>g;m z&tLP5Zce`o-r{SXNl|D8<^m8u0B=C)TAjP{QOwn&XQ(;Oza)cRz5{orWn~gTTHT@v znqMLBku|br+1@D2AELUszrkl#o)hsY6p`zsSKL^2Vn?Udwn`HeZ5=B!6M|&$&zSq> z94?o~-nDn7&|^iOVgNN@<&}1FPT3PWzzsrrDgnzq6?BrR| z4p#ScJRXVw3|1C{Xmn*<1S-S15_F5Sml@ZEuR`&necgkfz;%awssB8CR z2#Ee4b|3H!p0+Ng=#wioDZ-tcf9&V_u&YvjRZoT>PU4hrOigQQ6;#u~M0T5CEVz2I zsp}@i?KQm@yRyw&eP@evGQr@&!q3|JTa*5rPOLiLyTuZ^|Al~-uF{hj`6@Zf)oVT= zb5jR;hCTf*Oxea|$p_(D?xrm?OM8FCUby(TRaf3lal+|&9e$Q}9Gau8JxA4iE7U1+ z^e|)S-bT07>|d<)J_O^6)!J*V>e39{gwJ}sXB}x< z*P^M)`^0=1nBF$s@0%d6yKU>6f8UQTvE<7G%J!6r7$@o!_R%+!$_g%cY#f1DG!PXE z(>I;Z(`Eomfh%0xeT!u78#^j3d8!ERAHWbJjq^~}EgASJ90k1O13pB#v3zCICp5F~ zttWaUv$h9qYusrTSxJ+Eo7Cl5&XL5#;N&U3<*Zs8N={n+ok%MVr;YfYsk6t~_tGW9 z=5oZy%I(nkSrt+{>9hb+8{MO}r@Wv1%(g~8EM-F}L%Wz=fkVGW(dTQilmy*A^Tzq{ zJ?D9T$E|J;trJ27YbY3Tk2Msh^gLs42%JVvDAh9`QNbQV znpDw#j%0rQ!J)AApREX8=Nf*0l^lJu!DKlc)@zmi`4=Y@)@{yF5LUX~PKiG@L`BC* zC|N)eSpYdqIKz_pd&9f=I9$lA*Zjvbt$>x1W=AP-t(E3;zC1HlR{w_FR8v9B6uRM& zt@(3}rZ+|TiBw~~ZyiI*_7cgDE(09$Jj4a~66wY{E&1r{ecySWoRvi{Hmq-}SOV7m 
zuIS|e9PMyqnvcdo-K1j4Jv-^qj~c>K=(cMdi=C39(nfW%^RAcjVqqxb33t;w&rtnM z5&GmnPjnu#;6TS$Ml*=^J7e&KTAI($6TN&>>C%m5>el$qH#ByjV_#fQSNW^&BvD#_ zo#qtLYkh7Cet|(Gh;Jr%^SB@p0@&uy1aCMVotfGV9K_}{1kfRW)X9OJY&^DfKDkwXguhz*rb1(qVpg*y}$8(-gH+uwlB=xhjtyE8X`+hwejE@9rTw@7x zV&{LDYNu{pd#8ZT1hXu3;oBI3`5Ta1zAam!iNJ+~5n$|IB7XC%CITom6T$qRyfJ;O z`Eaw6dh9`U$F?ET*^zH92psN*a*xU9`bo0g`u7~z^x8;d(K6_hY935f+RMTE+_#=5 z5!o3$fFC6VF0;IeavV({jRY>MVo43OX-7_yrHCt0#Gk(wjD+7z4aH=BmPGXPOzSFH zmg?~S^7&ptadA+ven4^${_JsHZwZCbP^#xjXcF@!v*DH&(b;$4>Ouv_>IzoD>!$@O zEUgaXR;~C3OeF2ps^FvR$eSs9^_eDpMBHw^#&k=wCgk4U*cqpZZrpif*l_wl#_db( zGyx<|)A``)1!tt|^F87AKr+s@PzFUIo1xl|nwK~L%H?KwT{4@TgVnHn5M${q7z&|3*_|IRI6DxW z?cZXp<}`YSLCbkVkdwcO74hx20#U`IgY$n8Um|DNsMRMdAE#96HCvV7Xj*mo5IbBI8MPDN?dgJPQ{?TIVMILHOWl_ta{1h zpeI?IS%Ig+LEy3sbL(!04TRa7_h*3xDxaK#LytD{!&T2V%#25OwNK8O`jt3t^%J#) zqxhUGA8${NqvX^dczkz{15Eo}VH_nPhGv)9Es}555`w@AxiL1X$O#iIVX+v|Z?Wbu z{qgb(H2=CeFzmu){4?=0PL~(L;BXi>p_3wOD052l%l@;PFA2Dm4e8{$8^{GbCIcOd zE~KL~C0_048m|KP=*qM9_j?Il&LOf}Y(qBVTIsK{ zi%M_=iT{X%g`~yNacvCf(6-{O8K%V$e)9|CK5V+lFOVxTx2X_3Z;Q!(nXopP!b23> z^|-x0!<>`cYyN7Uc20i%W#Q!wAs%-^;nJc1z zP@bY2l4?=^Co;z=)dx3GwW3FQ(TnbfQp$jMQ#cPjZTjHHYHRtqeLC*GtH*Gyt`oL4 zVhM{$zsY9gTH3}OEHk~-fSq13#?|M%RDV?*YGBo9nJ>J|u^B#YXyu=8XS{~V!H<}=8bButH=t8f z9kYZrPLD#WXA2Gjj7mijiJL5~fcb3bg-a*+;HLgU#e-O6Z23%(*00ujYOy(XP(026 z69XF}d`fo8*4HnTNN{xaun^23VHqj+gCNatel8^h zQ(fXhiVCyAD&K>UHk>Hr2snkXJ2nZ~Y!N08z}&Jl6k*ThQuYi8WvzXjQRqRAv0oT) zbuJL*DNID~gNRFln$r#U7%>XrHLtgBl7H;YM(-*)Pb|Im3t+W+_V)Sfcvd`GD0p?_ zojW5Krm_ina~Yykc9%77h&9@xEep$&-{^QVx#z?MLN<1qbLN;CqQAr7nm4@z79+q^PgY&+lhrrF)-M#b0J$rffL0pfdx62 z5*JBK1dq( zk5ZuC^&!-bDr!G(_6a~YncfYJ^Jp4WT#aUcjjlwfVBCg)3uM}jl7lZrD7s5mZtvZW zo_RRQMv(d<{+;Y%XZf-?fH8EPSuRqa{aJXYEeSQ*Me(+O=|rI}&zayc%a zNk`Pel8_`3A}0LOvvjv|9LTYTrT{1KueRM$m-33)nWCG7$i?gEBOI+AT26RIvG6$^ znQ&*{hhfyui(Dd{W5Mc4Y6W~okD6HY7W!B9q$P2q)ix-9fKRFmP+8ml#CKqEJ4VnS`-K=~`P=@Ijg$_UEM(EFME zahpl$b(&c%FrrSQbZ581kl=+ay(<1t~cRq(cGpl9eWOFPa7%d(4Yh*jv`d(P$9 
zdojF^k(Xrm$hePu5O?*vPIpoiG$gdT&1mvF9>S%$UcJ7ktqsLoUJ%}U#bI+iR6$iL0J%LqBt!sW# znOmPSubP6el-@A_4uM{shG*;|JmN3UlVv&UsRa_Rz}`T{FL9`WcQMeA@zLk4cudDY zc!2ugn;)8-BL7Ve!kRhwd9wk6B6)e-A2R`9?HA0Zmof&_fM`d;+T{{;auoC! zwbFzA=cnC@kSob*QlJxKTUzt<&9!ZW6)Ef0u+L5gs-AWqSq-_|wjO+_!%PjYhhPRZ zFeemz>{*z7L?wB}zat!?ALmb{k5kP2*?M1lhN3!7imh`^qUgu?{El!SV*!mK#$w3; z!3@-LcA&}BqmZAbznfH?(psec6IocRhC!Vy6g}<2!L&rqf13RC|4WlRfR8vvH!VH2 zJQ(TVnzlQ2I%C+dnMj(t+Bj6kn2|V~akG&@xXyL2>ShA?GBG(`F1=QFWoJKP2t7yL z2BjiJhL}vfcxu@MJkDM3^W#1w*h^~1@QmI6Onn4>=}ikdyRgzG5Pgj(^5+3D@Q_7A zkGFdh0Rh<`Cli6Z$(!6TUni~=<(SgKQ&RkXo^&J1+wxPku8ra{w65W9t#PzZr)q(Y zK~q>u=Aub4qa*q8C_&y2PY0V%9BHHk`0|&cBtXpa@(fzP%0=*=17U)m=F=^ANCquM$*zWNr7X?OPY)fC92=wfx`|mf@cu;-Nli(-(5^X zO$V%=TzZA$OEu1aUgb<1_2c)e$fSS2Dv@Yt;|6XC?A-_&Lc8(O2GPOylr%zm*R99J z{tkU5>UCeFI1ov4Wu$?*!>4H)Ix}kPfYRU^K;Da{&=e1nme-(pcpzn8rAUeRZ*QLX zI8#O)|7l75cT3eoQeYY#G(jDC(qsJCgCL_(FKSm&h| zK>s@t6!13U6zg47s5CIK$c)e-0Ws)svs>B8bvAeX>!v8#?{P_?ha%O4hqF#nD;uZ) z%KsR3D#23Q7qu$s+B=>7EW9H|?=(*1oO2Lr{QiL8ms?IYqQaaruLgRIEve$SABr3Q zX=&t%MD3BpThyF_l6vw%vV6aEgs#uB)da>1Uo0h2bI#HNS6W*Ao}SJruwV9C0KyaA&FOsB&EK`$dt2fX5jjoaCMX z<5ILaanP%uDF-%JCb^js%XW}Pa_2RoT_J0bo%Y7v{r7jiy7bs|2n&{VO*-0K!nh9t9-_oAWo!i)(UV~~B4`PR# z*kqq&$t+sj(wHEP1`NOYtUUjUUVe)Nk}*Y?sHw@Xhpyny@R|2hAzBI`Y30|y>i;$o zJ-^oCM5zQWkfncYhrshTwTkWEaGt*LK=F0bFY*y_*h za>dGnJuXkrYrcu;+k52`@m4mG1H^Gp`9}$`K`?zZi$pCZV{~d(dC2d58T2~j&wQff z7#3YG9eABV7sia((}l(2o~m8l_^BBZX)hJ1fz#8#Wm1%uc{|KAFu<*#(BlH=S6n@gn~$DvH++yTOJP7PCI`>n&Kcy$v9^$|nWxaDdVG=`@u8 z2g0C!ulvG*?QYJGudTWAy2d{oEUJKs?3mT7eSYzKlX)HWCI6@Y81M{S0P2(Ih;MK1rs6}2MXv?zgD4zxP3is- z#k&VlL?9}>t>b7yvk2rOe_IkuTCY8;=(&Kqh3b7z|98Z|xL+5d0B`H)ea#DKsVh|; z+T}gQwzxwUMWnyY{nkzLQQm+=WF8b#<-i~1^3VT3L~0nnKM@1d!*Ke;{xfS2ADks# z`}PXMfca>J2(a^jG(k{u?>&b1x43X5l0L5aA^(W&G2#%CHKR9q%;_yNAUyqi6oqfT z+8CcCQP8X5oktP+BSB9S88Xg6m7qg>?_4-dv?U*U^E5c1P@)ms1=HxQc}5zXJ)E`r zQndgm9Rz-breSGyiP`cA@}dv!;%Un?$kWM;!usA4GzoVxkaZIm{s6CSF7*EIpZg|QU= z7n+^i{IL+IqHV6%hN9aqNkJQQ4@Zx>v{2XuM5N>{J1?biAVWz!HGaO^!l0~|^C!gK 
zjhM%%P9}CBQaDA#mRAa(7I@GsXzZazf(a-*v(x}J=LU2)VDCi^svN{Tzz2DO@{pm= zyw5=Ck!|TI8uYOALF$PYtsO(Qi75kq`@m90`BIMkdvyiz_-Ol-wRt2-w)OrPz|9^Z zNH`cwfRl?weys(lfzy&E zHx+W05DNz!B^t27r7Pt*>Z=q2d-kPreWboT6yb~bN*S>VWRy6Bw_j7kNZ{=pd5}ga zLFSc7xT1J`LA)Ss%z@X|IbS>}%GtS%m3^PD6N2WR1X=PUL4{ve@?$QQe;RO^=L8iG zQp1K2!Tiujcs(dIA+S0D4*aME-{;s7E9kLfG9FN5uD{*3^ho}&Y(#+qi7ug;6ckd_ z3o=9&{gDG&66%M)^v{dDqyp76zX!x&0Q1JiVYJqQdIyX`*DKT868dhKSQ0 z@}*-7sF>;7mb-PheNp%(=SM9{Az&Ue#5(og+nRy}6}aN0umyEu;OjeRug7vr{jnTq zc_Ax$VHgjB^@!Ha#BVi72W}1?>JkNAxlSxlHV(X$#tPG|SirSVas zPJq@4foWF|*+g-=RR&)w{iFp+A?WA)lndkHCR4+H%4$sv-;G;{(BEmJ#4V`I#x1zg zM`1QVV0rUKx7Sv_N07P|kIn*!!)cJdY3&d@HI2e|#AAYMKv?Q(d0GA(l>Ddzr}IgB z8J?W^_k%}Nte^$r5spXG|eY}aV^eLS*aC};o9rxz`*vG)m{1-bs z=>#K4=#in<)25B`x-_32e2bT+of7yTB@IBk{6e7D*sT)E)+=EsiHHP`QL=U4WQMT6 z!819b91A@CBv25$dir=>LE@-cEnvXCC&o9pLd?t;C`Bh{%?6aIXMOaYpk)${t(yi8 z&lxW|Ol-vQsO2%KANshR4VBKC=d$ni zho)S6wxi~TBO8dNHw)T_uexVz_xii_%ed1OkJjKxQj|C!Y`WZ43>if`P5bBGMIdhM zK?#!X$0}k=xH906#9rSYC9ai4kZdxvttt2g83S=8jmFol9Wp+Yy^^Wn&Mzonp71Sd zBmp|pHi+UDPs4SyK3nUlCZr*8G0Q*QL-DWS;cFETp75kc!^gZUI_;i_;h(vtqln^W zY8Qb!wSrCr>iJTJ^dp)C>gqdsSKjME-|0+ydoMxICehhLIwHW_WyN!g?L+=;^nk%7 z_|}8bf4gNyWJwrXM7JF?=KdQFl0mjNG;`@QN~Ql)JKVAoBZc~QU~xxzWYUz zO5~8!O)G$qnE4P_`v=fu5rNu`y?st;@fR!xi2kd{O*+E})`%@Tx(ivFGK7{lZPz?9 zpK6v6nXqIsOFtQZu9?Q6hH-Cv=FWPpw{o3)U7$)3p~Kxeu!VW@DT)}I9;kUomwEsP z7J96BpvxJ>4A7x`RYbSjXg(1%fW z8fY<9%Yl)X(6Cc=*UI zi`0-D4%Y0R2=bH?GjV87g;!EXV$%+-fPcu*i&O@jZ6f+3->31&A(r9xH&91n5$(O) z9mEV|$JMkw6a8ej5T`ah`$$bqZUTpMll9vS6RBEBuA?hJto{br4!O>%-0v zfxKmJZrl&!Oo|>)mr?gy0^p&>bvlTGsV04<@5o%Nb7;AbqCx=8LsH8hr5BkZ$8*w2 zPxf&&_Kl|PBatK^L62G3<&O^RNPhi=#wF?e^~w8y_Xl!hQ&BEe!Kdk)*E}aN{Agq$ ztSm{Zd<^wAqa{rsP?=IOCM}Yp)0qVA>HM210RWvR1JDwz-k65$1bP7?^ahCvm-WSA@@WSzCU6-dLI$#fdYU!`zw^dw`ch3Sjl$097pq&KCk zuCM+MCz>re+nc%xp{)twVpZ>kuUj<){T#2sVYIh}PI}QVs0{#RM$?$j`@wbX##l(m zAMq4MS_(0;I8z-ygH|Xqf+>UgxAH21n75cRcfCCGAy&zHvf7Zcwc+N;Ycf3DUGu4T zY(G2Sa~dVkFK{d9i`Hz3pfNgz%n^&Sb=Za>wj@2J4QRULVo8%i_peXlGV*>ZU(E{+ 
zNZ?VxD;9iQ%`VxMM;zz6n5ss)X@41#XebS=CNh~x zp-{M-o55XW=`}+HYcXcw@lps08D*;(&vL&XiE&2k@Psm4Q~l-Qga|lG(Fp{#4_+mb z2|7+Yuy1%{(_sTITe3RUp8Og>FF4iD7uGL6WJ6m!-l;|~ybTwMQ2;yav%w49-7p=m z4Taw#V?sBmWj4q&UbI%a_KGNe?XY38UhBb5Ra<-Xcx(DuQL^2OZ7*BE+DFUbyPt&P z($*o{QzsKJ&fd@}vL46JJ~Ss6(-)>gcDxN-NfWL24JMc-V&mw?h{R*Y!=KCOc*0K1 za86(YBzjJ1;&iJKipAFCcJn60__;bhnltN2!dlQT@-WC+8$c69Jx4m4ve}PbQP;go zyR^O}pM0W#Cl{m^C}=&;mY7$Dpg%Q@wra;td18<&=F|Bh_$AuQA!?gTVx*@ZpOf1T zwfPqepZXlz=+lZ~@IABq_2mhAcVKtE5e@8u;>RnVvek8gISEPKt|z?hm%F<)Z1I6g zQEcaq(ddze8F*Woy}-?c^=F?`|5%%P;w{0@6=&)(w>WG+i31}v+9q+AAh>SC)Tf4P zf&H`e5ieRrjye#Cx#9Z9$--D{ZrT-`c(!|O5W2Zi=?&HBU7h>Lfc>Naf1g4bAjiC8 zjl6%oAxXGz@J$f8t#PnYYNYchD)#7+C=s0!&-YD#aF#HM5TO3u@gijDfvX;Mm%GRh z=;mV@gaa|NtiR5{z~#iYYdt=*JJsz%vkRTuxol8t^K#&<;G|fVZ8lS>lhreAts3~j z*|T<)osE?r^1mAIdw)AlnB;=KBs?-L2?t(X$yh?dvE8mdrww0u5pZxIx-z}1k5RC+ zm+qd??)dW#0XIs1LV4xxNSic#WVG4tA^Ewfy(8QDv9z6^!J!k3zIf`Z>Z0%`tq}HN z0nDMm^P?|>0aJd>?H$6{FrDBQ+HT9Frp4FK-(tUI>vktP&e_smF-L9Q>~Px<%abi@ zkr#Xk4eWIo#PN5vxa^5vXEaOGagk z_Z70%_wta7NpjQ6FJAO7qjDGk>U>0}Dx4ORc*z&RzeF+T#_{8tXUqv!6PS!m)?`(5 z09GrIg%RQ|j#eX!AYmDQT3WgIsE;6Ks!AY}Z<~SJO*-L-h(-}n;#`BMQiJG)5aO0s z%j^@|pefLHUN?I5s9*YXZ^srvBDOr(2FsaSFroS-YcD$2-4YsX?Fw_W9N$c?T8^&5 zX}c%dzCzq&#$V9b-pa!K)&|Hdb^Ue1+8L4>dUH7pmOAzw z999@3G<8D+0jQq`XEyAMmJLr_bdXWrH;A9l`nWj@kj(0H!W1} zcT3jmVH|g*r*qUDPOuj~*19$!QcKt5-#!D;{<4es16|D|uxhnj$l-*PTGa&9t~J#sOX*ibqGrB;)X9aK=*L zFLnBjX%7hspB60M7xj&_OLoa5M>hG+Zo$!hoN)(Xf-K&9K@$Hqvu zZw;SRd3{jKObuIU?Pc`)mxucL2X>HKtK!c7D|)reykIaAT}Ytx;+-+&yVl+-Oof+X zu!pezSn(_YpVJYC*{8dCE?tRcN5fvg^naa4Gv8i>_CKEV1BsyV(APO-u=YMMP|AH> zJ^OHEtyCvVpQv0!!{QQz`XgV-_5Td*;e$`jbBflrSSAt_LXAMdDduko^^O%|4*y~P z9zFon8o-U(R{iEaVr4)4L!17=%yLSK);RpE8YankIqhe`Xr!^Wg)} z5D2C+0M1b08Ylg6YyTMw7AiLmgltB7ym0qWX7~}e><#|0UlA zY+#TP1|6xD)WiSZ3V(u(HbdfnGcDp2mPn)V@x*#NShJ;OJ1^IIWBK}@-Cs^xt_j_X zxu4xZG;LgZ!*=ecZ&`<4*dpBBnD88@xwv*MVv+KOX&NbYaFI9r%F_%Fzx1jmbS|Tg zG_V{gzGoAyy^i=rcf*0O~0Va6Aq5<3@!<@?gxO%dVMd~mC zdSyTOSW|6dWfgIhT!^Jx?J|N8qEnI)D8<5(Zg6MGYWeb{1u 
zB2&1niZ;NYO}qij@qe$qw@-qkTE~FAx0{QTlAc&Tt}}9O@1)p>@O=N{FQl#Z8$`J) z!IrGhZYwWnThD%vMkyN3J8)x#Ewd{INu-ihc=auq?}_QRld+Hgwb-K=5rN}%Hq&a$ z@>OiQAGp?fq7kYmtpdHiHHMDL^3i0<0x{p_`$NntjP#&d^n~Tx>{y$HS|HQ50nntY zIJp-45fE}S>1QA#}G zOA7t>N>q>UTMIK&SF4WZ_YR40iiJJ3^y*)rj|vMB6+&~)p;xXe)l+MNIeWal<<|{ zQd8OtmrB=)N`RlQ?)TEf!Nu!yu-uR=~g0z?%(*Va6Ylbie3Nia3J&M zvOF7C9=tc7Xqf+ywm8t7vCoNyo7)hu*rY+(8pp`2e2VuER6lIs_}9QqzXJFBo!cMt zbvvTyQMdXkXw{>vc`_i0=Mky_;eKi*)Y`+@`N@D4vZh2q3AFB-a zvVYLq-nt1NE!Kj9hJSBH5efzr>INvl*bIEp0^X=mOjS=72W@%9v&{Oh|dmie_Jr~m!67l6t#2<}Ni2UH>|tyL5g z;eR2HwKTvfbG~&Wk^oPyeku6xPNky&>TV6=|4v0mmp??G31_@TK_F1oLoUGona_R* ztwR2n{>gZFInvdy$Nz^3||Q*}bce5A7*3lv-o zR_L{B*FWH31AiI236H*Z?ZY)`iDxR#TI)%-%E_8*YDw7BNy+pouan7by=SM8&v734{x9(nz%z{rXK-=NN*j}@%t~vN1*->6P;?R=oW~i#8{EYSBo?Zx zQqxMA3Ud<4qvnJcsg|+w;q4|ns(&O9zeqygEWGSQ&!W$`ub0lUvK)?i)7v}Aky3$V zPbsp`PYShF1!E-Hs@a?wnKGKS{|L?)%oXYt(oSoXUn_<9!np41*@;m)*yGWquG8)% z4Y7o`GeqiLvYN#0lZSc}|CYBsvmv$DGSy=aap}6CcGTqz1s62BL|3}lc}7I;Ha(@X zC?l_3bcu{gva346MOM8k?W;_Dk05ppcPOtb>U~b8t9v!kWQ2EOMH;=VdTQEAwVnt$ zBkPGklOQ%6EVb*4mZ$7mq9FRcTRylH|<(IW1V`)JO#`TRTZI z0Ct%YQ+VB5V*(Ojz+)uHXIrUD(pz3q+9G*( ztV{`fTvVb#rjp&1@{K)IGN;c@MU@Rt#x$__6;(V%8!*70jZ=L?v0S*6%%Il=@Dj`5 z_LSyH0$S`YP^+r!XkX85C5QRwmAt-4Md?<~(Ae6PuQLJY1CphK4y6$Y_>gjWh~8tN z+OsEciL5oaZ>Uu>!9@oQ6)CJB*t#5p;EIpot!aPy?I$-M7X{+4oxJ%*BvT#k=SA^4 z1I|kFT7JRPWO#X>+m#_3++l&-9TjVEGovn0(B_gsK@@)U>tx~>k*It#$%1{ZMB0%^ zED*K|Cw`k$(CH!?noTHR#&Yol`{Bzrvcuf5?~3x_IqgM;y@NjRMvH^|Qi`q(iBX1Q z5a*g|qSB34P7TXqKjdsJVEcD|%S}QPtL-r{c7rAX@8j$c8P3#&1}V}Rg0BS#S#4S< zizX>Ed?l*dvz_K^Y~i`hscY~)_oAM*yO;?Zu<;U}<{C*x?8MJYc~d8vfXdGx^g~ zzkP1oj21&3?A`|kmDOZfO)^R&`lmK=;Ld(6FTC{M$4HfJQ^yJ%DJB%L_qBsPiF?@n zc-DecxBq^(&1`qFAE}%Py-gLrDg8b{Z@9fBB3t>Ld~UP8 z$^(WB?;C@(w&Xs>Rkf$s_la8f)f+aY8sAOA+EH(paO5;f?QdzMk-x)g!&zkOyVSO$ z{o47aSBYLe$dq^Y8BFRgHKlv|4j@-cBypQ{O^!>HT=avs6METwBvskrz0h38z-$J6 zITMhwm}YIvm4x206^sa>416+$Qd=SwK!j0!lg0Ou<{w9P`X~F+ zhjz>H?%JhH#J*6(4wwA=c9IipnlOEgeSf|9{q}f|NH04Vs$au;h_k3Kr~aVmOOG3U 
zUW{KHpkt;b{x#u^@r^z{*ayyL$E8f6Q(QTp1!413=B+1xGD8LSeUJ-hV|`{6e3|1F z6trlVktw9R5;B>T+(J9<2)z(s61}fGk}1IH{|?J!jf8NGSQ5H`=Q6&MZAgu9YB)|Q zVOKFMkEOXw)H?+G$7Y0|PA|KMuU@!|FZQ>!+?pj5dW#r8qZx238`65eZ-Vr|d!HO| zQ}a+gn>I2LRJ4*yO{yj`PB4E!WNt|51k4L-XF#w+xk#kuDifBkEgb29TPo(UkOe#* zQ3nXxzVcw`Yb=w+*S{ls&q6;}C@fVcq;q_DGdmXK6qNH6H>m2Qy_VF5T`YHH&TB}F z^S&`MIe%PUt5ZzZa~x$;t{b0i!Omr-qNTio;;29r{}``Cy=7~~(R}#AQRjtRVG^(K zts-2ygwgQ?zcU(9kb`|a&X^syN8wE944D)t!S9Q5FFQ1-c$Z>_k8q=b>9;7Q-be0y z&#dldzYDG93t>fM3p(#g;dblz#1`Q9R{H19_PcdX@ju)csXQq8R;k&V*I{x=P{<{@=hrMyX@1l60F**&CW|Bd%t*LTt z;8GV1>4QCsB{prHes=ZGGP92ld)AxcZqDPWt0J{0z3K93jetH&JJ5AQb)WpB-Pf1x z8M?1T9z^VB8HGufmN5Yqmg<4ySx%SL_oC+TsVjbc#jD2)${bHpf}~ISi(8SS8c|nI z61gGzF1&vwRqO15u>bVCCy~2Zv5|_+Y3{3P^GX|eGJsBvsXkJ%O0U{06ZF?0@q8e6x19Tc4~+Ux z`{&fH)+ewAS|wS(fp6CBq#paTO;I&jI9;rfP~FgRah+}np1V9g8GGjJ_3&V{V445M z5>IMM-I0p>S(Fju!Sa@<+f%+|`dP2DnPP4ouant>G!e{nvzyPc+UepM`i8^Fj0S6P z9cDU?dJ`U#{P6NHBIe^ZwBKALuDqr0VjfN{B&(X}@h&{$@}#S(-pJtqQ|xqWDcf1@05sp6kXr_=p4Uar67F>;fn-FJb9htAS+ra}eVx0!7&tO(D5%*^bDl2g zYstwkH@8-ZPA=bTpXci4MJMjGo-9g3_CD_-c@D=M*5WJOhq#xT1uZL-wkhG6aqgTt z^UhSO1zw#)NvbJiTO|YK^OvF*#{=`#{DujoyK0)6*6lO(Ck&l-lciL4)0;cGcj39! zB6Gu7bIOQuwt%64-QIB1ufGgof%TV?N$Uwzy&hoi`idXjX&{ZGqXNCyMU*1h39zu` zh$d?YP|M2$m6Vn*AIZ3P6E_p1V=(aw`<7Nkoc*@5S7EXU_V4^&+)gmRqJh zg0jm_d_e<(3jr6iCh1Rxnv-$q9GtpGjNqbPb7AwPq*Bn{i*n&CLadV^wy6LkDjo=C zq)HLnCYXsWPbkTAAG4DzXs$(@--s|0IBF7~Gpcoxmj@lO2sC=`I$pRRG`DUH54^!` zAT)tZStL-CQMiu?AK1$c(j#r#`yqoEo@XMTougsYVEk*;iGVUB%6ZP6rcW)z= zq{kBg9}dQHU+nBj?|jOj3?r|@q$yDt}RGe>L`Gm`wO&|vy zoNdxIcTrfLEU#F-`nch46R)OYC9b{)U|a!EK_p@`QWEwE8DLpa<%+3Y3nvsWOYA|h ziH5%0im?)f~eU*v!tLJ#?s)fiS=n;w5drSju zD6A#_HK)X(s|=}B)AGj5*#>zNOLlchk%TY^qU_Q?JYXf_$q^8o$*32t%>=43?2mUN zJ9=`N=$b0ht25(|9LpDas-+Xx>|n2{kzhy1NqLAo}BT`rLAI%_Y*ZFXZF<>_I~aq$q9^kfDjR@+f`QnUa+F5MKFcH}23YO0EZBIoaFMz`hEmAJzd?PUnP zrZm52SFqOk)R$Z=lF`w2T7oE)?e&FR#}?X_O=!*d$T&iCdrHmh@aRU(P%PmJkW1m! 
ztT6ZI?g}DqHA-)wDl5K)L;YNaF~>Va5*PZTB~&665)_A$A#V3wA}5hRybkzIdpxsR z&<+5C6(pgF;??0~lQelP)zWx51rFyCW^t z+iNCxbYyo`yDPP1tTMA(g%^g>Mn73p6V>d{HqA#Q$Ym=r ziCg_|$OGjyl3v3BuZ9$ZLbNj+5!Y@ zKsH6}K&gMKPFYbDLhe{rDm3fW0H*LxBMP*o@wza3nhHp=sqyf;YkaIBhI5VDjEfD5 zI)@jRD_WyJdk&x3c2kiY62P^!@gf1{o_+WGD=`XZgD|^a-iOK5v;=u800yp!nfr;bM)+^Z-{V zgs)SqH&X{v?`74bE;ak}eQJGYJR{gh+>la*+(9FZvzolq$Fpw1V*8a}ZHANQ0 zamqL7kXY(U23afwbFod+F8fqgE=lzC!$df`9VxS1GV6ZSLzCVY0;aMmUU3DYMpiu+ zI#MY6hWPoq@DRgN@tg3uSZl5ko{P!X05DZTy?97YxIVo`CI#u3j+>Jvm?ExLOa??t zaVJ(+;6}M9XY&}s=gm9Iws?wAtJ-sz^X0^M`E+x=A~#ou+rvWWd8l=pqNm8){q#Vdo*G30I7N|z23G|k%Wco{(;RoKz-66`ojFVP z2Ky&a%(-*!g0AmJZqZyfdxYNOPBC@bVGjl>vh`T+v`4V(`kG;9l#n0nu0ZA~wjR{) zbxwPRG9ztGZZjdOJabrl~Ya%ix_(c3vDyJ4N0X8 zWg(oV8jzG>A=p+|oBvkAIU0wh?_`26mS;21(gS60Tes-t2<>)zn3u&{rB@|8qZtM2$mlYTpCRpu_3_Cp01R^SYB#{ljCW#J#frNa)(D&E&0 zi>DZC^*`e2TZ)&O9VO-&^xwg3?q>?Mi5*H=TN3Krcj>>8@My#g9FtqMI~NZcFjN`Q z>RS<8dNJ6-2vN@<<70Q~BjXAMu_T1Iwel`{TMrIZ>}1)=b(JV}K?la%ZC^KuaT>B)?`owfzJGX{jeon}aaw zQ8T$TQTdoN7FxGMaIq{YXR&WcU&=%A67`>9eJ=Z#=-Ng^|qjRQYDRhDxKxv47>`j51Qu%yx42)rNn-XF&5N}vVt zAaalJJ}J>I!bLqcX4?577!ye08~D<7&YCt{x>OsfNp&p0p-t9YIW9V3uq31(6j+pO z+2qhWy(G+$(Ns+704}R$+)@-rUkc^xQ@hdem1V|q23${r4XXt9+~n!qnTw0pe^W%|-^6W{<85Y@$eqJ8ZC$RLxVZ0LGV9DtYdkAdUU?WII#1=if zBNlnnk62s})%J81a#@wI{660g%`K@L;=Dm7)gJMr5d#ZyWr|k>L7=D1in0IbZp|U{ z|B7QTFl@G&v`HJcoYu&9Rz=G-;GQ&~DuY=%TZ}g&!tPD)0-QlqBq?;W^ZITZh4;Ku z!v#fFMv3PYyZk$eHA7TzQ>j9XoqK3EcqVIIFIV)L$b{F_@T?{gbFp0MOaRr@L)2{p zy=y$b?YU_K&6{+e{Z6MaSUk0U!4U%cScw|Wi?CR8GcB>9OnJI?dRF`g|CJFcSq=LJ zhJ_{X#!B8}Z;m?sVT`{J&izVXSMq1BtGDsnGg_MiXquX63q(q^Iy1hDvB~Mfn!Qtd z=Y85U`~2aMv}jeAY+iNaDKk-W8@6e|ML!*AfyiYOwdGZjyg@B`sT`mf$S79i<8#Sp>A1H1twY@`aK#9ZZt7XOX)$ZRa7flZ)tAPL7$q5= z@Abrca#{_fZDk=zP~aP~p0}10KfAMEII_GWTRE^#VgC=PGn0_(wn0ylfIrK%$A+-SphEO+=FqZLF*_Wy)du+$EIV&S5!eD)kR zIgI_d=JnIvKs=S$><#~;PwbQ!KS+&|_7YJGgVlV_gLvnita|$%n$=+U)N?S0vGS{a zJt+Uh_P<^0RYvgtBzmO3-CfsdeXKx8Fm|&Yzx;4ryQyTc_U&CYm4Cfm{+nPp*YG}a z#6d$@y=LUcOoS?>lq1RdMmm#|wwu=ZP6xq_8MWE4n`zBOXhd*JdbJ1w4#N0 
znhCbdN;y-GwYF;dqN9xX~lQGC3b0F50>JMy1^q>Qa~PB?^A^J$dBu{pbk6QZyZ) z&-jWLS6o{2i_RMA*MW_PBWkU1T;4T=#@)I}0dgk^^aALNn_B}zrUoKCE6`I|e#H#I zgu96@^Tf9^tLn#rV=;)ESs8B@`hX;_F|vP)Yh|){qVJRoeVKo=~CnB!9iVZHvpdPn(za ze=%mt&xOz-MqJ^wR@59ajZ5)J2ACx8HuVuu0}j8&-i`9Ngc&93oXTOfNsX!Jgyi3z z!*Cpt7dHcRNZ)H9OZtWFbw!Lzx?@S-LY&FTL#vmlL{zIg?+4o_c|&aK8rV6pJ`B~J zFP}KaSnw!dE;lo*7|@RlWO8uRfQmRw9zYs0J>uq~^8tD11y#%QY2Lolb2_61BDQu5 z$3s*?MA`s_(`G}7=_HnKFuC#0Ve+;#!Jf|9rV1DL%8;l#hsfHoPq^fhMB64$X>!W* z&O*kmN~TGprMz&$m&vybmwx_C?LmuA2co$z_SSZ>P$_j=mCnrz(J&GU?$q9LqmpHS ztdL4pT=atWRWMS<$NqzoM|k7Ib?H&h#N>XM_xz*LcWearj^=3O?xuLV>-t8byOzIQ z=JM@YXXLfqTq=9@NBd(#x7<=ws){yo=$Kr&;OoOp}g%2N4Gx8B{f1y4Y&8dPDWC1T}|M7G);x3l%6v zW!$(~2_NPTu}S-p6=|$%5r9;iT!day(sr1E?Gp0q4DS^%9t?{0@}>4vj9{C+hJ9m$ zdI#1ur6R-89me!Bv(48op0vJth zvuY-_-K-tH)tusaZmLQug}dkNW=`V8xl#|<>d}xUql`b{lsK*{FKH|CaEhgk5p#q` zhcv$87#FzBF4sl+^00fGq?nHsH=FH0^b|@NfsgqSXt!uQu&y%08n1vFGf91`@|8%;k+0Z4(DSJO>6{1 z1Zz)Y7iOCug4IZqkdK~`yG;vX+ZIriWi}bL_u=*_l86EXP}>?b&KZZpdHUVIaVBz2@vY z9Ae9abzVLfPD5#y-6=L|c*ppxi#Nb&%CflK z%&a87z7aralTD{NNP=OOlRALmAZCKTJnqCH@_r<|vh5K*H-Eimnju}~WXs!CkD3FK z`;;rHZvwzW*=V)9!eH)JC@uwO)HA%fE`noD%TO?c(%Mg82kt6{|@=Dcvs>*MH zVS)g60A0OV2^1`l^Mp)1i2}6({CNpOa@?~)Zs5q0O|dPDyf`3w+DU+K$^=Or?c4@Q zIZdcVd%ItbRGCfMMH~zU5DeFts{rg+ta$pwE~b`U7k0<<Gwd+Z{h<~lOaZi(xcd}H zOVf0gQC;DrQt9R#6OlsulCL$nb*U>@oib2wg3U_qOk&knczhKs5Gm8O2DluGn>@M_ za|7@xz)ha`Y#JTn(H(d#ht{_pr5wY$&rbSTuzOswl&J=*oSswA)9Tye9nwCtC4TKU2E;XJxxX1i?!wx_a4Cg)YkrauET=NR7c}1! 
z%kZ;TCuc7+s3j{wR=GzcRN{+UV)#aV9nc=-PdeXS!36Q3a?eOOo z4{SPJiEz74sYftMzb9GjTmHk>S#gkugL5Ut2lQos=yW@7&(U*X6sRoJVl`9TF0PtBMYU+rx~M;`#_=al zc6*^&r2OCUgN2kbI55BKTK%aPo7bx`#|yVUhLCE{BdhN2MZS8vmlg5v9m)JRK~NgO zyQ6x7onogu=_SDD%my6{r;51}T3*n+S(dAG(P%V4p*i-f-|SZf`t;N91}V_r6b=^3 zu6cbMK*Ag1K#^rySbrB9aA`nI))z_XFf$Unxd$10@qhUG)As5-JxMnzR|U@pI&~ey0$9KAGv$>YiO}V*e+(@P};^1qb*G$ z=`m&PA;gq%WOpe>wXH(zsi5ttXPIkfF3VWWHq4uyUK;5nXU`hTSeJZqsw(Y2eEUhv zSJb^0B%2E}Q^sDD*XL_k$nCx3(`V9lqqjXe{gwE;hR(70$4R2*tio!wxIN&GQjLbT z4c+pMV1t89cZ;X|I`-7t$tXdcUlwlB)=BR{`kL&8cBH+AG&*{}PT^SE@i|ow6Bd74 zPY(UX1plJvVxdAgZrUhxItJR~qBR2d9o= zueQ@)&EB*sb= zHAF+0azuZjmaRnO>|voo*)}5t#b;$R2_Spo541%*$)gZ`pSfDnby|5t%_8@uRBEEF zkh`V&a5`pC)KE;kH2r0rYVWy1A~RN-y2`GvP%R@9!uQ90rHY=*p-l3WUFW(q&Az!* zkL5NoR`S;fs#Rb%Rthu4#^hCoiKD#ql)fb~qTG#&+IN+{B#N5b)f9ZnL?%kxA-4J8 zwImV3uE7IfkIVV9DVGE-t+n}U($J~9PMA6wSKy9q-MV|>TCnV?M}{x0YdsiYK$_pm zT4+;O00t`cCsG$~PAU6r9r!36c($K$XvR3iwsmkdEBI(XO>|vhH9ROve zYG)MiPx0)(XBkQ9v))bO8_a~dech38n+8h>FhbK)i9T#a!o_+>#p1}+u9pgdj{?yD}g=2`+I%YQmOq zKXN=SGy7hoRl;augx}8OSenk7#DG$)E0XS|K2Z`8Q(*^8zA+3aHD^YMwL51=l`3@P zwBobeIpKHF89K4m2dPD*!~EIt1_>fdU@545-XmBcWkkz$Cg^{ad2>+YXO;A@)gs?0PQlx!WMMzy3ge}H<@w(FnxB&- zDaaFB+8Vh-Q=kJAGH6))%4i`~`p%3DW8xO~kY89TZ2)_%WJkJ9OknoZ1L#PF$gFm( z-=j%1Ai08&_8pOgzU z9XGBc$%Ie&bXBQIOn0{vr}6DlEU8q2tsmc}Zr_2i3MY-MmfGC*9Y)G^&E9k+hmv0I zP`S0@OI`&8*%=>d_{C9d=6GgzeC=a(MM5kSgUt{z13=W)yQc(78=0gn3o$Dkop*kW#fHR?`RzL|Ji3?3 zn{K~T!Gkr)ftF6S$-?BfEKCRq&Wp|zFsc>w7A+Fbir3MIO9y`)gY2`0c&YG6@?@V> zWjZo;SoK+hzaF$|V71vzf!fZcRYba7pZEF}nt(X5T$E4amG0LSE`v?e8F$Sk-7HQb z{LQE}PiS)h9wzAA8GLTeGIxqq&Yr!|=VBZZ8;vw=lKT{sx-0cXUh|WZ5;vwzp?oJe)7|{l>FL6L zseFNhO~&{56IN5|HnUmb`A;O=p)hjJS<@K?9?GgmOrqMoKz!qj@YYLY{2l(kEp~UN zi2mb!qEauJl8b#ux^!=&CHym+OeF)8XjONE`HgUSOXg0^gvb|@thth3XK=VAX~*+P zi}|03L*A@~8BxQ9k=w;EhlWnE$>iD=G4;mDVR~V|mKOu48=sS*Zors3_vm9fyjL2G zChS4>hALJ3{u@k*|kG7CU2FnPTr`5NY+*mbBF$@ z*-6VZkKQN`Tr0&dNybfc(9&F@`c+98lV~B@@g71+&$02=9$0`A@{09Jng(A?D8bda 
z;@_JUfH;cT)7wa4`y2}}B~IDmOz#_JZ%^SyP*w9JePLJ5(9M1g7{{;6_P#G+_nL^0 zGwOZc|EWies}{5T51XyP?2|>{T@Gt~m2NLBuGN(WsK9q942`gH7)mI77Hmuq1d^&H zd5Btd0~i58kBfxN@7)nCLn|DEgbi9NW=)kwK3TH`(1p`7N3bFGZilIQH+h0{#&Sfo`xfq54L1TRF~P+PNNP>X#v(tF9 zR2>2*AiByr1@EU?_0hv{m8oxFWWk|6^HkGj z?ab8#6i*x__$FYhJAl3T_6t0^1XV(WNhT|>;$nC1IvU=1Y$uqSON-(ee`bF;Z2ZJ} z>)Tmdx5BUSX^F9Vh&&gEnyy5!gNR=q>GSo&z@D@(M(Hbwru{N=%(5V9Bq5fx&+wq+ z7pv4&{M=YBvfHi5RtACP`+|?!c|xaov^;F!v4(ohnXXB-7*Ld?;iC?*qPD<1w_@y_ za3z<%dkyS;k#}1fyPMw!9oBP)9{7&vc@%{iP;-z)BwB03vMc&x)&`W*3yQyk+_7$x zrRf9^?g#AH3NYv`6-EY!%8?=n@&pR zIfx)w7Ii1ZXX3Nw@cRI>mA1|z(kS@PO4@)(9}mie(~g>-4I%Q4@4sYLdkz?~QE%FH zc3SVn?9w7O&J@Yhs~e6XFwQHU)OYzYyNIVA^Ifp=i&;`gnmKK9+wC<(5x6t&{TtEG z_;zR4<)4d>^PZ&&elGsO&2;d?nRB9)qCSylSF`0@9^YL3B2LTM>b-!CqJj6wVM@-4 zm`bhRT#faWW`M`vvuIjX8mTUfu6V%C9o-{qcwfKk-CxkK2RbZZKE~l7Q{<(haK?Q3 z;m>~r9QruGB2q_7;mSdQN+%pN!oqBSy*BsfvA>@E|K5jjsAv?i>?@*F{0BNRL#3%h zSKK>r{Krz{mltQ@{box3=Q2^^0B0Gre}4DRF(Hyax1-ab5)`{nUjx(M9#4Ar*u<{x z3}6%TXN?-+U_yu10u^Aw`tYk1y6)O_vWiwZ;gYgQD|yF2#7eRuxRG@Kgm_iFXtM>I{TR%EdDOZoe0kAAPZ*bg9m=iE$n8U0W& z-^Bi9fW#d96qD-Zb@WWg7UL3|9SM4N=^P!cuB-Ag?5C?9s%gsoQ2$;XS>)m~!jE=q zE9YGJff*^D?}bdgUFKHO$v`Qdv&Ym5!ePl?*D0Xeb9Dc{CB^e?wLjPYry%~hBD=bL zYD{uDD<#ibsB^K6S<9`BGEHkCSx_r2!Q|K7(}KE0;=1Ewhn>{BhyW5bW6ZIr*X0{) z1uv365?NeQ+T0gk0TQHq7}xu};XuZzbIk+G%@ z=2GilQ?wPAlVeR!nXBI3o1A*_$JbrrosP+O9*|IK5<%m-xR<_BTP6c8{sEu1+(bH$ zWMP&>CE|vO)bEx5J@<$IE|-6?V(3RdCtl;`g_ul^k5+Z|X0IMiy6z0lABcJ$LC+5| z6jk<@thA*Y)<|1Dj)pn4EaP5$+{Uo(cKwPhi?^{{7Mf9mz8uem#=fb2{HGYAzPcrQ z3QyXgFP;`0@z2LP2X8p#8J|rY1*2=-b54R_3mwXky1JLw|V1>9F|a9@&^5J z=c4_7jTDB36Abz?v07v`($1{iB|-BrNU>tN{dj}TlPXa|F=oo%s(L`dt!ZqdQqjHd z0(}Ej_4>*iuN-36{%21{)6n>i_3mX2x>ij&<>zrD0zu`zQ!`ynlS=t2BiU^$c>3+% zJMNa89XHO7QF%N$N!cpb?!HB3XL{bWMs;gxfat)$U$_OOvH4BIedoXl`_DH2W=31O zsMjJ+(T-y_5p>;2yvdqGp4-P85!9u8!7aPvaEsv zD*@A&x68(<6a_ba#48|6ho< zybhgYt-O1gMWPjTuJx95%WFPxCw5?PvGB%Dsgp1`G2v@fiaU3O>d!3a?gzY0d+>KJ 
zDPUQTN&fsEUA?DvKB;&adw$tsR2P!sc~%^xby|=1Ieu|)no$>&;<2U?8&44oq5h{sYCQdAcxFt3^oyh*j!aHze=gnM9m86=CUd{K@%er_UTp>*6ur zApB1vfM5Nh24A3v0Us%Hm+iZL^*~6~ttBIqPaHYT{wdCH77n`IsEDn;tqcPBh1g|- zLQiBij(-w}>g= zTbOx*dJb(jN@t1c4OTZ*Ep5)7d^$W)t$&)Z#I2tPQ*J^hDM|+24le|C&IuWts66*H ztu3S3zu434Tx3{w@*iy<`<*4WlW`FVOBD_t5| zd9IhvjCE3P=ER_^OG3^sgjw2}!tGx9XB;l%+We`8&1lfDHg&Q7$$9XL&zzg3Fk3(Z zBT$HZZp5)vR|AIqdk7hmh|qOEGUvVIaJH=ahIX4uR3{O37kU#+yIsFBM(Zm=tU%WC z(?RF-XFg2I(OG0L$m!0{a+Wd#?}TB-{3Vt9*^zT;Az4$^hLiDsFwYl2M@;*Yh4ib) zT=-h}5JZebO?QPlIj}ps!XHf~dcJ#$Gc@FHX#(F)NV(+Upyt8yezKF3SL1n)P1c{% z3tGK@=jqLviN}9ZSZjn8b)17fpzPzZ^7r;HF^8VJ?%fqj@hp>OLtS$-d!1o0k^7tfOl*0~54b}ixarBRt7*>y z-jcMrN&>HvDR-xYLPX3blKL4%$)ADMm`tfnz7Zi-a!tO=@-Ljm& z>|E#pM`!>(G`(m~)Qq7EHosC?MJ2n6T_;LVv+rqLP4-uscgZ%QmM772a|05${f?RF zslmwscNzWK^o6tW(FFmbz&0B4p1dOYEX7SQV9bJs>ipyCRsQjMh&I!2Bo`gu@1PPT zQgl7_JFdfQJ6hxJh`EJS&5{E~{+Ii%-6pymN2;QT7!%1w_qJc-#YsxIVs6@l>gAux z0%#NouQsAJv~K!%w;7e>qI7Svi-zFhs%nEXbDL79xX(pQ#};TF^7C;Z`oHWIRMM03 z7h&s{1qxylahGZs_u1msqJ<15{h{kC_d4~vxge!}z z4EsF8A`K7t6PC_ir7dZT`4f(p6sZ7g#fAPM{}O(~UK8{-Fl;vVb2P*9-k<&Tf&H6s zURr*Sd!9UO>`!P?<}JFD%s9JZ6kE6*Qgfj;U-~Bp0uL(kJoPPRz1^V6@gihzq-Ti7 zxL&O2Zpp^A(XM|ztNYT)=8>V~>rH971T z{#}!E9-lQNu1Y%~_Wma@aa9CGeWk$;f3YXa!YeRB+9N8e2)Q{Oh^VfS4Un z7L)~ro6u+FY|zIpqE4tmNv$VdR>KZx#1vo~romZyzZ3$J5_qs|{NBm%GWKb@_*oUN z9yQ9c+yr5kSM}_#oCnx)_*|d7sJMZto^YXGNLHvNXtAMp+3>FouP!qB4>=u!Ioiv& z$(C;O&Cbi;M#52;V^rm@s^O{@{C7C@JBqb)hOVc#?bY7H6L)+yck4%T+Z%m6ItJgR zA15;ttbfIfSJB?%#=#p*3!UmwU1ek3q%WMOPn?*R7V^CH<7b`+vxW(YNA}!G=dq|B zz;CmEAqrGG)rY?zi)^wxRvVH$SLD$kr|pzA@0(QYZOO=?W8s_XlK(dwy+y#8<}l;%^TUstSg`Tt8?x%Oyf{ZJj?TaU2l!BhTN&r{JTIaImAw&^ZR&d;#_%fyYN|i+mfb|T3a<%qoYt+#v-A-rRbG1E2oC=|vvzFYOGarb?W zi*V?75;>H)$`*exh|6l5Sno+I*g$~l!mhZjXc`(J|0j|tKLjwT&& z>W#ww20O-|_NDyQ|2`d{#uD8G1qH_X&@aU;$cJjxoJ6+=#w3ht!|t~1b&GPw#%tN{ z7m5Bil^|XpaO zxWz%ap1GF1wfc#`^M2D7&(#SXX#X=Q@IQ(e%ILUhRztP3Y;n3bWd9#v!2dFM{P!ZL zH4xv%2QSG5SZJD-$@CACEOBAZE=ZqTPbJTLv2ZO$qVbadgpdC#qTrvDfdg%QEBwE@ 
zV*CBEVk*amxMzF_yZ8RwqSZ=;Yw=ZP8cKx&f=gO1V?OnTNJU{bu4AFuR&rR{l>oTS56 zcLmy`h+H@~xM!eDG*U>imw=6C)8q>6oNp>(UJ_wC1ykr9COmSbG6Md_O@j!pnE8?7 zEq6-tv9r&}S+XrR0URp{A)3q)qZ^xU%Y|*&?4Hii&U8^#w99V;hk2faZKV z)pdOUF{@MD6((f0y;3H6H{>kdbu}x(XnHi{^BK9VoY2wsNyO9-$K|)w1%E1t9vxYX z$MM0`M?HH=h}k#oljXJJ14XX~zPvP!Rvrft+BaruhoA6l6_*76w^Mw7fcLy#C;a5( zs1A3w@9|vhC9MwjZWr1sf}HJ?)&l#!9;aiYyjcij>oyq zx6;k1J6ui@rmI~?&!&+PVNDJ^qs$yW7#nSnl-uB!m3opXSrOWUMuh0sW(VirI!oOP zDOW=b$C~`?Y|E^rnha>MsPL$s(m~yvdC~0QQMqCiK%)JBRQJ|#QFdM1u!4dpA)wNs zbaxLW(j^8R(jXuVT>~Q0HImXQrP2-3Idlw4hYa8d2;&Uh?>WYGU-uQi=l!1VpYQWe zXV|m%I`>{{9mhKMI(-j5zvPp=zRw+C;=l%mm`ZrC{EKt52C}8x*Zl62j_B_PN}a)B z1M#4{lL3yrsor`Z$6bk~ixj5Z2jx@l>6x4{!i3A+MjOy=GNCkai?$F>vEB5|K-OMl zQFl6357(1szEu+$uXfEO%aS1Gc_kx9ied3F4RnSZiFQfwsxz%0Vka&^Sm5T%YY6V9 zKz+< z$kf?Z1~2XGGikleEnhQS@0!`H@PdjWvTA^k-YK;KdF0ws{;mj+G^*Uy9@PlO>soiJ zuE$$moZP_ft_fZEqcxTZY1fapdQV*9aAVnLCB|vMb3dqCB_Kak8DQAc z?SuMtx31>LX*Yd;zf2huxYp_DHjp%HD-kd}R>}rUslT{R4bxQRDsx&C#bwXkxPMR8 z=g^*Mlb~`~!V5zKUL{&khkrQ;4gl#H;K5~>_QYgQ6K_l`{(1;=M9V&z z+02My8{&h$(LM|P#}L;?COb5nsAudkNR94r!iy(8M;$dD{>sbewPK12wNGP~S0_%A zh<3EFOUcd`3r2y&?eVRp=8vv938t2@0wUMR9ph#yrF;`LXe_KLO`(FdKpafq*5cdmE{cXNs%mLd7_R#xHf3qmqF*l*$a!KC-JP07!Iq+j*4fOrJ`$U#TIx^qR^?v>DEje0l2 z4lN0!rfj>hvEM|FDpLqvE|uS(`E#I;z1X6*f{$d>Pmy9fg^Xh&9;|(??Ur>Kqj^D*5}Be&dy?k$NN(~f|;7L zdh7*Jr7c3M;eravbJKQPP6=lRy!R+Jec#OL=PEd?+2_BDzGH#2>Ak)3ISmRl^^mF; z(>ZOWR8bOA zZxBLV`pe6xGQ>m3X)vy z0PY#wTO@8zx$hzIu%sD z1mi{VC4sX^gjUF_|5V9%29SRH0tI_1p^1faFa}*AoCARK(mJhJz0BkvO{^bxO9_V@ z=5y++NSG@%t)MOKaQArq`wF!RA)S3u(`k3F08sB=>v@~%#~U+!pSP~Uf^lv&wGkg_ z@zm~Ir4ByR>+oSkf9mkl9SvUeX?52}VIVTL%fTlx{i%a&-X#4@>9>_6v>K5SR@>n( zDBJDq1nzZ`NYKRBq2huXzo*S`mI|Ca$tlko0QzAN5m8^EBm{J2LcSVh?=E*gm$bu! 
zD~XKkHlXw*Qgkj;^XU$S>ujuV25{FU_p+40xC*p@N`MdIr~}eodY=*^1oiiNcVAdg z0LT3T6pslqOHl_as~_Vbkv0?Bpu$}pivCQAyngT`Eok}7h~A$!wa<6GgYP4y3U!5X zfgk-spyd1gY$e{9hf+3Uv=S+#4cy9~vt-XZH^2*s{i9$>P zHjEq$gy%Db{Yj=K@tAr9Hi!Ke*i)n!vLB4ehmp4VPdiA4tMHXp_I~oEaVAB5K;2B5 zaA%#C4ig?=zP0Y%Du+2`UE|MG-x=sneBf)6kN^+~sX@6N-GgFxB+`OF$DNh&FD#TU z2*gF-Hd4L`-j~kT%5d4uQ#~mgGTdBu=SEAvb_-e{RXXN@Y#vbhoqf1kY4ug(4KD_g z6%a}loF@p}QkDw)S1!QnGFMaEHYY}*{Tg#X9fhf5Z|;j$@Bp54tcK>TxugmG0>OLB z&TkYH=$Q}&$bJ!MV*n;bJ6`Eh({V}Aa#)uM^11jBNhXzwmoza`tnfLi!DY(sPLx6I zyA;Sxco9e57Xol_-q3?XO^!`+&9vo zQvk^L2f!-wV|?42f?J~Lp`kH&V_%m8+Ux{(h9n6pL`N4v(HYrUd=qlFqAwDeXfUIK z2tZ%9uOLPXi*(%RCnDQg{U0$pEt8oWC%CEa-Cny9W7#4z_14fVyvjlCoeyv<9{@cZ za>|ccACbL`k2L=r-~=B2;Ar70`)Y_^FycJR?7|&n9>Xz6HICFmR$7l1xffAavfuQ& z=xdov$LiP5jYS+q8@b!QY;3^M!1f{$g1U=;U^*MvFOQ{^3_BQ_{wiQIoPY5?gRYrv zN1u=Tyge$s=uEpk>3SWl8TqA?S7}hUrUIKM6HaqAyP49F`has_AWFXL`CGFIXR#M8R!lQv&Z$`$R?7@?V5tnDNf0d zq{Cq{`~a8%PJ4LHNy!Mh?~;s=$^Ky7*AG&h3x0&m4j(t9Bl?|VvXlAJ(*_1^&Pb%h zvX9fdW%wdwl0z04sd|QP{u^ik2+VDIAxt9n6^E+i`*pl;YhVg+=f42kQ;X z-7vwmHy76Wy2>^?pUn|n1gm1W;>&7qzvDwL@EFeOiDjf>n5mN-;cY!e^TgQ+3dY(_ z0wS7FAX5SODIB}WDL1LqaYfTK)m*TR^$>|vjC+2hYf)AZ_Xj*UQI2CB5cP#0aL^Qz z6lGfyjprl;jN}Aaef(I=)JWkVy;WlC8ovIHCqEYJEP>5pzidglpY7N2VM$aQ6N+L> zhd)9%*5ItqQC8nF)yAvnp;i&&UZqtn;KP{MUZ#yPfiE_? 
zKii<_ppkmS%YSVPcG6_FFvdPd2gpZ#6own9Xf zGPS*^V3+W91f0EKB2^@|X6t4HCA_D0Pa3oAmcU_s9Eh|mWk#h`X9WpMd`ZLEUQU-) zgTHy*>JUxHL@NwZ^&4eLc_++GwfI@uIJl3MD%uJo+Z`%G740d|6|1CWre&u1n_Xm4 zGOm!kOi9HUH*n{^XU!PL{h(FKN4arzKn!bH7JBpIiPTrYPf3!NsD5gpLOoq#AE{Xj zjT5@7PC~2>?jw_(iTsMORaDl^oSA~^be~V=NL_*EAP%uYO;AaTv9ff?IbUrobBC1_ z#Is>W`NZyo1#2qiy+{S+=UJUniHKv%8?Gx-kyJUsPqnU4pUiSO;<80YG~kiybPE$7 zWc#M=P2z3Z3qLw>L2S{Bs&${hiie|(rL} z2ul2B#tabi8GCbG%oTouXO&g00SqtuBJ1-5T+Y7Kp2L+7!(ACT)Ui3|LhqwtI^zc~ zht9^`tBf#z4VrVWzj1c-#G;sK^~(lz-~ih1(^pNWm=d!13LpgcHc3xqc~vT#Go?);=jI zId*Txt&ov+P|)hjK71>du>GjvH*?8$g4ilp>bgNiMUrXl;!?!+|Co!y)~K9LZu&9u zW5Uq|C!%)IMgQ_=0=G7cYo;<5;>##to?G#F!BVCOA-x#`GG^C3Nixf_Iz zt_-*zAOFUm4My&5sbQ?4->b-Xfx4^Bi_hN4R*VY2}8h8ou?<@(BYAPWCYcCpRuV4ZZ4}rU6!S8~lD;et!)yBq-f= zfcu&arNB~f6S?DqtW0m`I@G&9Kf#An`87F3FmHaRX$Wr_m(6A9lksgEo36V1rkXXT zwRFFkd_a(K>&hTQw%>FcT3g1K*#Nn-t6qx-_V1Mo{mldU8k$daA*7r^Qq6ngTR``V z8q)t(8{oZ^iHZUiF_@LD_&@7XUQJ{-a2RwRZ6LK-O7+aFr@Z4b#fT6G?Q#~5Lm>Mz zxiM|oyLc5cZS_0bD3pxKX`h_h2J?S2RfPQhHz?2_NYkWcqyCvF{Y9Q)Io<|&KU9rk zg->i#^Xl4ta@_p_XH#OthjzIk=?O>lq=+&;HUA^(pWO{#5axc=`n8*XX@2_7b=ZPQ zOKDzDZB_n*)acI)qI^d~%Q~vXN17hEklCw(Fl~5}WjDh!p~?B2suju5kl#&f3SJyl zt;;V9y|wUD-bLE`56kOT8Z0)ar4jK8xbbJ|`IpY-KZ=_l#Ye9>|3P~+p^yfJhP;W# zfFQa(=Zki}R`U*`__|w!eiW=;5c>M}=?S-DT#uR?n9c0FN^-oh4l}Tdu&tO9qBcR# z^7?IB7W|;8v8YEFs4qQpo%dK!K+Mql2!wNq{80N~H9iIwZiFo|Xiy6ld3>oagfIWz z5LOW|i~W^>9Oz`o7(E%Erpzyz{Ff{XOGh`fMD4^%In|n9J;fwUuq7cvV5LN{slcIm zVf(W*@ned`bx68$xxpvaj$?y2rtVK07gbFgssb%H$C^YIVc8zw#@Wci3uq}Ff0@<96>y+dz>wACty2*2{+(lr`)7@lCAg001pkGXY{5#d`#S-@EoW&Hab@Z#`+m zQV@4ekJP4aX*an(PW}OIs$k+TOX_oGZ}=YMlyaJ5g$MGfGJEqBBH-M^Tfe!`xvc@3 zWY3snV^8R{D(Z)@av~Q(I#9sT ze!N@UrS?&qrSWJ~%gHmw#+ImGHPEQ=7mI^prd*d$s(%zi6Kxr5XU%K9H+ICdPR+c_ zSrR51k%BoeW^)aWOrP&i@8!3Oh?@$w#URTFTcsGFIG?s>2+1Vk8um|na!7@6W1GE} z8wNBb7*+`J`Cb-sgok-K#usi&EzC|Yvk>mxooW*aV+inHne5TI!Z!Rm&13RgZ#Yc_ zQXS1`m5(H*7gTqhqm@GSr3bI~j1^`d-@4@RVEOeg<`slnfA)f{JBqhn19L{b{^Xi- z%?2-87#EM3N%eVVz^F*C9qxf*vjC2~_(6A1^^xh!w+g*mrwrwvZ`TihU&H%RmjNnD 
zz4x+$mfPl^Z+%AOUv#7ZGiXJ&T75r zKf8YXLQTJi^Kzb$GPvzg+WE0U`}~j*zp%R&w}qh3=NsPDUn76Nn*Z|UjZ2SO-O29R z-GpAnG5*Y2#VppVX};N<_9<)1ObGccYsYs=XyFL7;O3b(swUJDn`;Ii@c(kkP+I5n zc%{}GHq$B}yhK$RNXg(eeSm&;g^w&ThhE^kO~2TYl|ruWG4PkIoj@l2=MQeUBtYj ze{lS@zXd#F#t?8yL;-tRVb*fGd9l+U=3_Nt-XBEy{QGTl8_>}n!&U7J_rfdMQc%5l zS2K=M2~J$-QPy!ijlXxnHGJ3V>IpBtJKg59j_VssF}0K3U74b2PO(dQmdi&@Br+R# zZ;wNH)D0&Vw(J^rh#TsI;$jN*CcCKL%Rd8EJja!Aj39mJ*=&Jn+MI^0e_xsfCM_Pa zH@u%j^L5GF4MKU9YPqj!*MUfd0d*aGpD4w*lw*DvgH>nC^@YUC#R8_ zVEU=o?a!|{aT7BazNorVMN=d`7RxWu<$PD~Erk)Ptl?;A`SUzJgHEqyAt3nkhrOu( z7TWORsX92k#UsG<+)KudV4Xuhy<)4}=iwzwcJ^vpR@1EA)5DX%Jn)pyYkFOdLe%4M z*jV!L3=PXNK9E)3KGL|lbrA1>0*YnFA70Tal1_Y_d#X`-`QuF^pbQBYy}xXG zWyIz}jBYZpWKkZxr|zfj#I(V<5#-1*{&THUk9OJ|wqJ@A(vmLdlo{52@uj(VVACrF z^=FkHBAVeCBW5OXnS3(QR0tmhJJn6;ZVC}EuTq$pL|9sXEU+HwkYhD@GUU{*YKw3E z8o2EFaSPFvVmv+gici>Gz{}|yc;~%;j4i7?y#dfXZnEm-g-OVnvlKGUvTUX~jNU+1 zVeU~iJEW;Qc4oTx9$Cv+7Jg^LH#g|f03@;K-n4(zdyL2v%_JNSy}Xw+ycvp#|Gt;Y9E(kw5;*o-lzAO-(Y`qB+(+ffODn(e9>Tot`F=50b9%E z$8q))Nu}rk<{l>X`5=ub6XvSb00a@f>!^7Lmp$v5-3K>zuxMi@TZVsZB@Y^O%_%4( z^n-6(jW;Wo?H5tiVqrbf! 
z?%Ilzx#ksoOwZDAkw5Z02jTX&oi{^Qhvi;u6g$dxmM?RB*cFROD0%uW`CRr*w%89>?Z6b2bv05I#6{qOkCIi}y|4`1oB# zW=fzMEB%Ie3b+OUtx3HlYxGo<$qvv872@{likxMJt(RG|KnM%*1JXGw z;Chc@7!T>N$R6kkzu~&bhQ@KI$b(>D^6zxlEwOJqGPPwwd-Qr2)4)E;Bf3HJ?> z%1B1oE6Soo9;9E*P)~mB!3f4V@tVem?|b5uM}jvJ!?E&IcMT~7gwKlCmkUh z3d6Ge=H?2OvzpY1HH(qE9@nhJJRc`A#$lb`oMPkqkkd@P$9dVEvAC)XYb{%x6i>h^ zEr<{{{;@ssVccr{6I=MxUL0DLX`*y5(yr>7P|jOztw!-{z-cr;18G zCn;}J9104G+fbP$Z}ZwrT9QIhZ^Oz&;_;{Sl`E6|7{|4SU_n;b!`3@a zqWzIsdNo`5wU;dUYN(Br@9)VCxl0Xm0`B0R7p+RwTH1Pdx?^?mOnzUV2rAYpy(-QY z5vt18nXJc9;>T#~3%LyVRp6UzcqAj{eoUe^I4|ff(>PWru261Pc8@-a5+WH#FYe3( zs-c9o%E8UP9_AZ?c_r3twwRi-{1Ds-x)ZM~j#f3-!yEX^u#4gN-@gY&x!JQhM-^4j zUDp1Wm>c0obkaeD>1+y4F_q2Zj5e^$AfQ#=^lOz@g)RB>dv9yY?H~dOP*8n=BOka= zie88S<;}pFv3-6UnJKkkCRM6f*>!{(?To|NjFpSOPifv+Z18h()q1c$eCV#;T9I4v z!{4E@(?@W|xnssXr=VP9M0$^d)zW1ipGLXA+T}ln+4G)M!@~A#p3psFl9|CZ-_-*I?8QGXUx z%lwIH4O$G`&9P7Cz*Jte6MCiu$bPU#2smm0T>Z5aKxp)9SZPJ^MA0IX=O#t3sj#xRtCz6Pf)@YE{0=-$O>ET8}o8@5?_wQ}=0rBPCCcv|ro|SPP zM`X9G&yRNb9XO`>cCPDU=1~TB{>b@Gw!N|&d@aoup0Q^5YuR`=!7b0E{@5rWMRF-K zsg*^>&CfZ_*;J;bR!HLivfaNemP_4_!M#puLLpYBne+HVL%_HBO1rpHy~d2w#H}-~ zOMLIKfeCnqFY+w4Bqmmx=1TnZLYB5k=ckrt97q$02-%&yb-F}EJReg4hY}uzI_}M6 zLOop}h+VDo5TmAuligir)Rg|P(N-`gWs3bS;ER7aMu7uZc_aU6s@Q1*=Mx~xi_O*u zl~;ch><1(u*C(cbCeNLBP{G4Zo2)oavvFp}9tGOANo&~6{jc7fg;oU*A9A!C=$WY@J%q=1_QV}Jn(&9w#GzY!b!C-XfphNKETG{ae*ivC%f?|%k ziCr1*Bhn0iY-wyrucF)SLKzS*1E>Z{csKCO`n!?A;bJC9m)oF_^HtLaSlsgC%-Z9- z0{35vBl}$$4az%*cD*p}r%7U17jn5hinFBpXPjOAVz2$7kwj`Ukuf%43BF!%VR7ec zkgeI3>F2ki9Gr;${dtK)jh@s-`MpEIb97yVUOYF#cZF~4!LIN&s(}1E;b^D?GP0$7 ziT=se-f=KW$@cuB?mC2z6m*%^h0chiA57Yx?K9wzck(7qH6>O zq&kN|nzhqIJ4D;~lTH|jc3mA)ob1@tSgk-l#*+n2-lplgs2AJb4^b`_9?OfojUMHm_>Tz~QCGtWObmOfwxU@{kOVo8i2 zF9=u*dRPVq?PEkA)XIRlqUJdoS5DcMfXM$r%Pu{0H+n)JC?Ki@>oOK%z3!To{Fd~O zRDIcW;zfV)*G-38z#wEWwGOXlzxPocs)mPf;u^W+4w8g9Y4pJO+h;cSHo(`f798g8W`+3d4^!g|H#O3$8yw(N{ z*PXv@26v=(>fAKO{yng?Pdy;fS6B$&2!N&Gv*X(e6vY}cd2~I%t)t0mzTxJB@}q!5 
zMZJdJXA}#kIqO$|B51JpI-RE*|AwyMb%5K(nz=`hHtrfzvCxmzbU{w4|k&TLPXkaAyYUMT)h| zzao7sWZSJAC_0x**WZ8wS(C6^|1@rYQ0G7B4-@DTrRCELI1@9ADQry8G z7gg|nB;WqcON+LdnXAVDwh*7&+!%KX6?JIIXhdje_ zeLqqlorMEgvoiLsNaJaDdP7|mu={aIR@G|hij)q5zT7WuMrDvq$3jRnJp~vgei`)A zMpq_Fv2A-Wr~=5a4K!^8pI)~{U`LA3kjdW|$h{R9VXF>;T=L9vflnnZq~IFxlvJ1B z&-n=wws9xfo92W!^O~a5;G6sb3qBo#-^?Mops^T$Hz2J`lqnT&e|TZXl$VVM6lWt z9UrtvOIc_sm91XDTg`;ZIzJP)O_*z0)uTI%_jgkw|3=95bC9g%cy{0c+sqblItD^|Ud(69TIyq*bHK60Vt5FoNFEx)CYFis==gMr7Q zdqceS+Z9jxnHC%A-w5H~+fc0CMqw8C_8m&?lZ$X8^zD~D@pZG|Tu}1T<~&Xw(e2oa zdwmw#({T@V%^DE;JwZjfQ!;Zws?SK&KUC*-s8N%%XjucE0ob~4_-vm&4QQ8w9uH&u zK*CW*j8|)mw%0xty0#Re^!Bp@mh);kTiAf98pDXkuBTva>{Pje2hNshTI%N&7Y*-+ zmq^7%iOtvz0zUN@loi!B3)p|qcq&=g={6uEPTz3)(mSg@tdpx$SnzZ|Q#sgj>&|bE z4_JFRKFbJ)H-0=D`;Y{HP}4bwZVh!~a$udnwYZm2w(4X;364f>K30Wf#H`@iqxPa) zNT03F4^R7iyHvS16355@IFwd&D$kdCHH2D%Aa$q^e8%!yownb472HLq9vgXqf~9LJ zWp*pQ5bG!(VS0X-yVl8RK{s(DVS%R&jHp}h9Sb~}0j|wj0$N3M+(N!dr$S#=)ngH* zH@~Ay2jKf}`k#Z0I_dn@d`g=4JVf*Mt>%veg9ucouh0&-1-djj+;;(;K*q(B#o2vO z&`)>d=MLZJwj|4*g|t*3pXuYr7q3i)AZY>1)MPG59aT6K)^IX)U&>wDz3v!;o7(Jr zkJzfZZ)|5E zF2XOn+3PT5Qz}?*jR03|4WGpR1Hd~P({dTiimmG9xF~^nkEI`c6srN{qpEq{V-uU0 zvQH;p-lp%#A+B23N}cc7B+m#8zjm1M;vQ_Lau(pn;BLq0vhsTp# zEgsqS4#Ec?p_vdv-Bj6_c$qg}B5>B_Zk=kUKC*fXZKSj@5$>1J^CY8+Kuxh6O{)y9$Fh+&jL;fb7o*SDY}9w490MLc(e9$udc3N zbP{dP<6qQSdI1`LgVxM_X!M~u8?B!l$hYf7Te;6~3g{<_`J`H#)y_F+Xii~S;CWM= z;wQWlfsfqR&okfxK*d9*Gj@AlRZC58;5b*%s&rcMw=B}Bcv1?;x0c-646(vYYTsC5 zb~S6tj6abBQqsfGrB(e6M;z1t|pNbJ(ypZ3~TXK>7yEC-e2UA}xgaP|Kh z=X+^SI0poj`gspQYR#^7zK$Bs{7fJAjiJOc1h?0t($I1AVz6b@E6l;fZK$Za8RWnf zgoI87uSX@*3vi3eXs-x6sb@DsTEYcOf<781O@|J@NmT(}hb0pyhXHq|WfBd7fO_#+2J5|D*Gt-@;zti?l?FdlG z+kq@i_DnyT%PiGPt94tQ)sVh{{GH;E8*m+ui{B`mL)Q; zt^V$SN5x--k^ZfQ@zjp00&83yV`1X2=6^3iF&c(C{#Z6e46ZNP$|*PpVf*8h{KasJ@EhqRi~Fx$JSE7 z$zeA`(}{|UqoEJ+vSxyo7jb`6ttn;ajyrh%2j{t=Rz#NVJ$oKa7DW;Ri1=t~$?W5L z|9R}mlIaF7)P9o~y~_en_eIW1d(OWeASbpeHEntifEg6XP11Iy6;e}dD8dyf?)9VS z>zS85SjQrR7@0$S(lg;TeKN(egpI{68rbyXuP4BZv}$MIho|Q2Gc+QK 
zhxY7f%PPrXOLIIHRwCPNpF^bUW^O<-TE7(PhVTmTF@q}W{C2s;7AxSfnDMuhuWtaM z6mSIv(?8m3T?<_mm8%j1Vsk+h{yZVQF#kNHU=VnW|G;H$LN7FOSVw(ONT3@eY+BtlfUD z?2W)bIHmur`L8mMpPz(x|AVLd)%1+=@1=;JpWLMW2lt7!E#T$>v;0*>!1&{n=XU?# z6n}nlHRkVeK`ueRHn&`Y`dh3WG0+^>$4MVPOE*f~94BHvXYH+8INeJ4h1)(Xo|yY= zsY)>o1BV|n8cNC9Wi*q$l-M2J%l?8GSZl*3N0~wik z4z#aJQkBL`T{sh8wUne48lh{x=}7dZO>Nz()Pvv*4}F=Wam zI>}M-3ubVo+*SifVQx#(wy|nUY`pMZvas`7ovxeXHPhOvfHANgt6$=_;z1>Y821?I zT*vBIff%lq-zf7to1}H1{KN2kU>s- z*9VCjx3pv5dojdttk&uXhJH=BeD-04OKoWV9L*@f>h&Ik1eE1j63t(QCCndO8If`E zq3IiBMI%lLku#ENHf*T`G+CGT_!GQb6UA-0QpMNhd}X*->o7=eeAlG-W-~Yki&Ns| zy|N}U{4iH18WtiheSBB%L`Q9d#$1Uvxnn&I#I{PkYRlhNP{zU6rQXmPGf|t9wD{aj zObp&ATis~oN~wCdCN%=B?&$E;lcYzK_fCHr)j4BN zEoMHYc|ofJ^i)*+h>+XVhnK}S)3HF}P-By)YT26Ejz|3ZZ*!S)a?kc{K8~^}B;yiL zByCrI^jep5_3OW>gGt|B@>J&}hf67{f?x+mk^o^Oq_Xe@M&^~gttK)1%-{Bx7UU3&QSyx`+UX(QS8)WYT|{=T_+sSka(5LY#9TPx8$5}7QVle>`AVeXwciCB&2WnXP$S8dUgphvzCMkTw0 zRr!>A>wAGs7oH{0L(B|02cb@A+}0GA`$qj{IDa@V@DZrZ+2`xnnK7J62hLU*sm!oM zSH+DVzmUg`%xeec97_X{g0P<~Ip|ajUQ@oVau(&ff1q>Erf|eRF;|VR))nrwpSA~9olWX*)#})`K~NoA!#UexhK8dv3qEm z9JyyD0WHdp-y_$F4{bTF8E%<0($6sfy-o4`FD&p%b%b&Bt+p73%Ss%Y$TIhKQxBe^ z1OMvlf(P!iOH2*vxM?JGeb|xF=#G8o`GS>7cKgXue4TibN~^3YT#3fqK*fkkq&Z6* z{{ka|Jx_|RPd=+7hZsLTlnnnX=B*`b;&siT0wKHolRi5=V&*-y5q&>cio0(7H-}vuM;N&azN}SU=%YH-L zuoFebc8@q0=<81C;_@=LYUv#7GwJQI;XA4fw*%Wa8BcC;#nWe{%VkF|^?;*!T*#zz z%?H}vQ(g@VUWT2&#I||B4*m)Pq(!}28Nv&;<%c~NI3Vn0mY2d@?;c3CYHZXt)OJMdX}_Mb zUaP(3%EsJVUkAbwQhxPmD)*9k`)>n5xkbW<126Sh64_CdxU?!-V=^-T-+;!c}4U#)>egW8ghkPf*q0BKQci3uCcRVz(6Z zlpg05+M1?+n-GWseeB@3jJuTPf?0sIk zVRW?+LZhVGa@)*ysUAjJQB&5=suS;-iSa_o9X5Lpy4Hn7-0#ED_!u8}W8U79jLJ3= z{WR>MJZm6ZZacTOa&M;?aK`n3i7eylAQETXlpXF&0d<~2FUF~p1TRQxr)I*69Og)) zj9hDBs8e{tkk~5q?`SXW4j>=W&9$Q)BaDqtUp-Qa2=0Vjw72_gH*#DP56A(F)5Mjb zo{aSrPrEaD679cU`WhXo+*=;7@=<|aT}d_7-*OmT)LNd2XA>2FUByOInkmO>h8L+Q zZlspli;Dr(A5*cD6G;;@BYSJ;B`1Ux0Qay{#a_?HL^PD=#h99DO=C~p_qCJ{h+Q}{ z+D^8?2^+PrQ*pJLD}xiH8`m>!Q$=IZ0bQwdgzCi*W+DB#W$WsULjZih@Jmf0* 
zyLtHv2*yH3c_AGsCs?j(s4LMX1&wC->TsgPN>ZD`zk0byF5lTY$XO}E?Mr{BuF(_l z>*G!z%9%#$hjBu6#i5w8hKOF<7KKO`R@<>z(ubvTGRcv+H8U8aXnS^CYjc0Zl+YTW zF7?8s=P&`6WOn7ff#7r^c~E(MyrZ*hL5CS1PW(5~W5QQJiKZ|Fxzgz~6cOJY-W|I^ z=179x<8BSmf%6~YarOr-@99Ifhwe`YEuR;jr2#*X7y<%Y=n6{s*zx*~BRvZde+&~) zJal+^zekit_XKpzOE+e&6VG4-X5d)e)4|X@_j4(M@7?&!eNT&4IDcLd4b1RxT6)w^j6 z5?gE~pxZm*78;aWRE@X<;__oT#B6a9CK&S_l9#i2MF9itFQ22t{pW+W0k@iBE;zN< z6|G%o5|SzYeALX<>8l@AdKpgd5d9?IdtRnkftBqO5HaI~MH9yB(X$NUEhW|L40<-8 z3HZnkac)asRH-NrHFYv?_sF_JiS)x@$l%{`_ij;PbiB5yTxjvsheOy@0(G9mMAo>9roA4LKYm zE4#)qBZH}3jE})Avh^hR^ys3FGFyiRD=|GW7joz^B3&m8xBhP8?9q4C#KfgN9e5j$ zX>38q&1k&SUW{3W4d4YK;I-qOm3&A^X?9GAA)SgOM_+g*M^p>R$A!r_OO3FuNK7I- zxCq6;fUE}+Z>fpn6ZZ0VKN34CcUaJcLuSv}k%c)5k4A+TGK7=wRb^GqltCvW2|6$? zITywa%5yJR=N5qS{h|)$f!iTFtAlJL8Ego3)gmc=M((Q~_`+&sINK~xmG5yiT`exs zm~AFyfpC+zBxaW5aAh15C-k>o5IA8u(2kQl1f%3|AZ#b%5H?GiZlZ7b7?}e($r9KA4)uV}LQA>N<;9 z(~>xk9pVw1eJVc4?iIX<{NVy+v;4JFJfdxV^3W~C0C&~^8XIcK3!4vfjptRLE$Y-I zg$@la-2eJIfr^4JYw=h>nc-ZauCggVI4(uvYZ*~5UY zM&8E)#tMBEzMR0z;qM%^Z;*l}^Mqb%46V6+iHnzsJPisMxosB4bmtft;^1{0D|hkr z9GAv6ahAE33c)&$#A6%Z5WYbr9Ww1eErv_m&XzfAGSG4Y^8o8n1qNl`L;FM9Lj_&2 zCS1H@dbD^*QHh}LcjA25-qQ9?fGcRQ8}MX(`<5MRc3u#wtz{Fl;mNm45f$_q=5Qw< zcn>(Ek0zh4ua;&qh(QC*0az@p>U_Grx!+x=9VL6Pkt1)+ljb_+gXYU>;<+>oS9X>~ zU=Am2`S8GBlO2Ym1J4dmhyb6G>%(^2;kKt23DT&ks!Pn=g9pzi;Kgy#~fPDE*d zH{Wi@X&-I|0fQS_eZ@hqveA+~w+p%n!A)P>g|3 z;@L)Itc+va=Q>c**e_IWgDqwhgg=B~nIvpwtMU8otu8G0{rP1HGk%2wc<2X>G)P`p zhl{@($B^!6eO0dz%k2CI@i!EFed87zn?l>Ha=a3z-4%>a36d$AA5;&U& zIX*3%^^$;APgI*p2P)4N>Xq(q?G@nvEE@a?(fdUvGgMnyudih_I8r%RwLNYFMrj~k z2KWuq(nfd>mUQV3Jj+HJH`4Y@uTKxkf|yKQf_xB`#0y-e*4u*U$sJgy*Q|?{i}4Sl z@jny;D(e+RR^4dvqo$LncbiAr0q%K6Lkb?>$}`_d5ic{$d-G!no6Xbyv!k)CB<~-S zC0Lje+FWSKPCr+-H-2TVDGpG#$CjRyCj6?2|A`QhmJg#XQ@~PwNd;aNG44IUJBpP9 z{Jbd6YPGXGv^93xRm)p~J}k+Vl+9j39{V3_0llhL*SPHq3Ax1CbXs5U}EanB8$Z!)!qC0BocZoyCdh$Xqwgdw|QDKt2 zQR(-sQSl*QrS7$9pcV2uMy+(slQp*b&z#m1FU{sJgta(53XbqaM_X%}bq~D^vRrvb 
zzm<2zfhXQdERLJ?p><*CNz2M*x*plHy)pDw+`{J?W34793l>vis|^pQV|gi3$E&s2}#gr*B*hdPn%i{B96Y1XxEjgret2tRTvMl zVo4~j4B#rnZYAncQ>t3D;pPq(S8)G2!P7`kW!3G3ve_*zC7@@glKb&g+*U jejdXAH~$y@Z0k(<7Lq?8A3ksn_@}C;KFL&|^jzXWcfXv_mTrs|DsSNh0c@Qu zPn1!w?{T4?tv0NBzBiumLv2`P>O3uW3pfLr&SY40(L@{@5oKiuG?JC+X_>|XKG)GB zS@pfSPG(CS+Ovm{?c-{v;5(2&`op}|tR-NTrmOT5AqeVIx+x;*x^t!F7FrGz(&(hw z9Oe*ZI0AP;$TP^WqM_O{``L1@$C-8>Dv|9LS}+R5c|N8@SsW&#tWpadt${IZZaz_H zwv)|aScO#Kq*1oI9No)PH5u1p!|tj5LSW)7Qc9{M*O0oA8gLFa9?dbjTkr@JMJq^Q zOqDv*awA*pnPt0TGUKSkmHTu}blL-`J({8(*cf?FAdv%X zLMYi{gHc)q3O5xhq;dn2FV^d1RwcT+UNJ4M0*=i)?X+($ndIj=u7+9dcB4$?cn_ac z`$ZE1qD!<}*?0!w<$_j~38IX1#RMN0tH_j?cUY&$H5du0j~cEdG>k&IIc*u1fJ>!r zH4NZ9;Sa$SU7%5jJQf6$4=1q`G?Q#L@$!ucYSOwrG=(lT4>8TCk!pKB_4`3h972Rx z?abJct(N9lEMh#(9~7&YWr5FNHqYzl{i=(Ve3?KHzlw03us;{84Z9g+jX7H6I;{ag zLkX*4J&Q1OFvdkDrZD7TLSmVgJc1fxIqS{YVXGLmE`R!Skv9YmF^YN(MucRh%K(mi$B z*&5AyzRQPQQyA9;TPUj<)Nvn5p>FJ4;1=tNz#08z*Y z&9$tl;5J-R=jXk^W7VL8%0sk}YgofR+M*=hcbFnLJMCn1PRPLm-GUo4w1ztNu#3c{ zEF((3fi;G+f~-wzjzB|PHOAarzMC5k!=VBRV+5bJWfvElqpnZ7g=~ujEFCbtDcGkp zBvT>p66YsQpKn1*q!-?efKmMSXRAca9Lfm^wdoZ>X9&T*fWLty@CvLI1TlM880 z8L-t>MEAzBfV5?XO=(YL*#hQ+ZhA?Rpcpjt(a@pwqF0c8lWuiw*-v79V$KRUm)G@X zu90ZIU{q(_4ud-mUXuN&iuY#~zK7;!Z7>r%ga)Bt#50VU5xm+LMpkv~par`fdN?nO zBWw`(RSO$WD;=9d>RcQWHU*;p=2B9Hnf7Kr{G3rHnV;smyqhX<0U1EXa3$Iz(l49ix(TX6!V zkZ#^+(UdjKwuU)*X13sZKULjeYi$l5OnSX;Q3kk+p2sAr%~g9qOyg3n(~F0sX+^b~ zIwtkGC<-EA2T2*(Os%Grv#{y1P;tsN>s~kkV>pg)mEnF{b)78JHzQ|MX>oEn*0Olk zj;XvpXxa9NY}84HOrQxKj_|pqW(U(A*h@0WR{RDs3~Sm@!vcedWUwK5ls94-BgbZU zgs8Dm9JZ^~q1j1~G?W0Zsk_8Y6|Os7yH3{0sN z1~J4+9h`*i^t@9<#G6y6mD!fCajfKe*@i|y92i0yt|^acyD15#?3$ClJHzo{ZuDz1 z)y_2=O4aSC^=7$G&6Ot9^04+CQ34*UB|s1pND?7orNL-!MmZv+N|Ya4oq;J#x^{8c zqJajjB`5-0m9hxgyws(+pa>X+rn5P{&d?-e_(gZ#h4N-QRR=CSXOz0Cu&GiYoj9rv zBc-erv>_zT8ls7gVJ{elr9N75i;x1$iP1BSzMk{Ow4QHOVUbWpy@-($LS;l^#Pf{~ zX*Ilt-RMkarb^Yc25V4ou4Y4xCSgdk{u~YpQKwoDLf@)Os?X9m-%MCOXthK`ZuEop z44;}NGtW+jCNzby&Umgs?9fKYRx(p^42cM;gm?IcFevBzd|Zd?gLbo+1nsg_j~NQ8 
zSw*@~D=P|PqZ-2#lQ8RcfXd7DfLW8ga-1GkHO7u@u@#eo!fZ#KL{+zCRXyD{_-x0j zOqgd`DvAPgC3z9u_BecY-pF~iPacZb#u~CnWCT#%{fZU zc^U8Z6EhB?dCO={>=H266`qm|oCnubGz`86NCxhfuN0H)TpmPWjm=5Le!vchL1Eg4 zlAbUQLM_UNK_d}bf&&d|t-&zg&Ku(pY`H*!Phf)gKNoPa6%Dz^Ptia>m$!4kA z@1l5qnw2%x!|XsdJm5(Hp+@qpe1J`3yH`#3r8Y*u9OQwjBHTiQuD3g*BHpdEyUprc zS6Nw|qd5*}cfu$o9ac%LltUtF-v)pPuR^t|>Z_6~@%=pG7*sWjl%}C>RWzfa7Hg(R z>f_W*5y8ZTr7`F5)3Alqr{YwsIsF;!(70ga@=>a^1VK{i965(EwMb1(59t~`bjl=p z$Y}+O0Sm(xl5I5!r5))~vo`3(V;J(QU`qsqPuOJI)9b17&`F5MO@O|j`I;o;6w5?2ko9^kD^p(gBLJ9g=V+GFp-h!_YUEWIel2x*s$YD4K(H9 zyetuY9@b`koYGM-^%dx7&bYde4QD+fnaZ6|g|R8^SCh~#i9i&qO4xKpE>!3V4&_a{ zP%etQ7#8|-A>pejrbL5)p68`v-nOD>EDm6xT3Uiw_R4LCO~gTKq~N&$PP1`Vqj{ip zb288!p(r|rlU`^}qcot>SlX^bAui-vxpr8~Ldi_cL0DZYGh(>_$6UVvoXLT}g@ypp z05VMHDFO|v-4<5sGA%=D7*=^qbTOSMC_)}kkM8M74C$djr=$d*WwI;*53&#JB~2%W z#o`Sa>vS;SlS8yW?G^_RA99JFAG@fZ^zt(ur?8=m0_;a45y{y~`eb@c8b|<^0z?`9 zclZRzI=l@&X@r$L!wK~`&@^E{w(SCu{U<(225GEW06(Z|ks67*Wbu%O3OkGH|W zh)!b^Vw|5C2Gfo8sDcwz9v-*rZlx8Cky3+2VtQ0}h^$vDu(e*(bZo5Qd$^Qd0(f;; z+K&*uT;c0zJ_zd(E2Ij|Xtnj(z;@8d%n{%sD?mvOZz8EStpq7>9?ElR5=D4D&Zb1p zHUy)hRg@v!AEjd6nZPVtF{%}|>C3ISFgKWFM9FqdBCByTPX~>-UkZl;6su!mV)S`U z#|14ktc3NmhTTw4u!`aorB)q;$UK;{%!<>iD6wnsj9P@7z@M2l6p94C zA{AQQLB7;Z*~EoQOfMLzoU1k4Vcw!A9S6lw)|+YlMw~RfZfa4wU3A!Kn_g6}jEgz4 z2$V2A9aO7&Uomz^KFo!Y z-<}69Z~1K5A5`6eggXMy)W>{rggIqa56sk!YNSpA?IB(aSeWLDZJg07Su>mhpPt4# zUG{yr?W7*Q+_ccJ=b{}ERU>&)fZ0-s;PV4PCg(}$k6UdG9#pup(*i8gPO{z3#HnC| zi8F72SVhMp+$?913NgmRRQMq(L%_4BXpWY5Q@u+3lC7wmhp*D$6s80Y;)~8&w2IoeEt7hw6k93c&k~m{C8n`Z37` zqzDd4%VAWgi?co#s-=-zb0)ctZ)W?3pX+KxWyGSMFL7{r1cAPKT&~Yb6Dpxv?HMNw zK@4bO!zC%F*nYtkgt&v2n}%;gwW2?i%L11Qh20fEXthmsV ziUYw~vy>-zrOA1r%vCFir&bX_g0e|Mn3@k5*pZ4BV*~3(&+(``Q@|C_+1eB==IRJl zlmtOH>bBmQr$m8?0UM@{Txcg2G*ZCf3DicSVioRFqy7|*!FO`jCkt&SUmeyPvvMJs z1aRUZsAgJ_$JG2Tp6?mCP#zYzbfy3v*%g;z>WG%Ah51MZs*<4SDlCD>b>DYt?W!b5 zl_;2x<+7XW1#Gk7a)3s}$m2TstUq(7(|k7H7+1TgDW2*HoUax7Vg+xiXmysxS7==b zBNVT7nqXLClyE85WVMP@8t{q`>BD@kHnjjEE7zTz3F2n7!_bqcE6mEC3F2<^(5&Nh 
zhs3MSRGUwYBGqq+#kj_J=rmS>u9mVeO10dn-AJr?7OD;F@CXk8If(fvH9>hx@l>!X zF=2aX;9PHAQ9bx2nGc6Qq;pBAf5TC%wgmJ>Uk*O`f(l^ z7{bUU0;u9Ef}*D2W?B_3v5g9W!}-?C3gT*r%;wcz-BTh2n)BlvKE_fDpQUDw2rE2E zG(ta@XIR!QfaVmY3~3I8SqK~Rqy7XQM)9*YO0o+Nq=y)yg*S z1I=ev5!d3=L#LYtFWF{&@UMW5CbgKJzYl%>x9A7lKT!7ncXURDPM#)REz(=yAib(d z6jQL4W_t2U)a#=G^3P0-hNWDG6p^XIN@y9e2U*zfOgpZH56D`LBdq!i;f57A<@gHV z6yBn;C7bXk6{*xRb&#N=hYr>tM-C>}hW*?GWQk@}Gw{gHsMYp?KR+B=JTP#>A#jpo zYCP^b6o8iEpx0L$1zAjQVLs(Mfh2@tfoyWcd8-e?x8she#;j;EVmmOWn%U{~6q1}M zQw*y#auaC5OSC3`Kg7FXRG90CYPm@@?*4I)KgXYH{nwz)EH zwbqEvhTOd2bej~&*#OEAvyR`_Cm}Kzz!;8edI3YynHDfU4XMH!Ek3bxWajb5AM(F)Y4f7%k zX$|Ledl=w34Z-=A81>xR+zr}xMB#;8Q!5aN+>nJ9J*@}z&;`o7kTd6IKWS+rPn*}+ zK?MZ78#>t|B`P;!8db_ESbi3(@^Vy{+jA8KppOXgo{?Tmt)n`EjhhCR8-M_AH4w;o z)CE-P@DAzlz%j8$_9zyblu*_aLcly;#G0eLPsdnZ^pZZ>2A#D$(&4&H)|&`~83UfA z6I2Qnq2uAX27*^nwUp!gN&uoU4hPQx0Vmc}o;5AL23a1G$RN5=O+f4WsFfM_Co5^~yqpCx`5?hD{y0&^8&e#5&>x;>U5N%+{N^UZ+{BMzgLp z8nJ$N9$8U$T9K0y#jBAvC`QFufHv@11eN_+pThAykV%Qi^ zS_-2k3TtRAC-;dgT?b%Z@8vZfc>hzzZPvs-Zpu^4+$pHJL?KO|;PYXi(5=Ien&u z;38<$Rw0e^)D*-j0mT_}RaLxc+9A(P=MltZr8k#rt5i=O%ePikw!Cb628&GnmhY!ien**dhR}U~I>mBWeg?qSV*zwy3#0+iDDOQ((|o5w#+fvKWT7^17If z0&zM4!CK14;|sDXl{R|z}?tZu<@|j zor!&riIkK)U7e?-j%$so)-_3utM!U4)|quUK1&1%2rYQ%1o)Ju+^AyIXfo?y0fT9A zztb7Gm^!Ind&tsGnK?k00Z-dsGqA&q1mYMJ~$rQXzE~7aIvP$tT9WC!b-6*%6BJ5FIHVC zM6hwSqBA1`PQoS?%xE1`n(TD!%A|)0&0^LGvwGa_)c7%>vwU6kvm`4>91fM$atvh+ z$xUNZlSD?$9Rj4Xi*YMrNChI%ez_j-O6IFx;@ z=+`)q6H(^n$v}b%QIMs1!(;pMG?wPHA)>PstPVw$vAR0nt4SzEjGHmw2wH9hNV&u* zcBKM0i$sy`@@Znc6$Ef-Sv+8Labjo3z{LXBm)iY6N|}gIL{5WEJ?YPj#ITYt={TrD znZ`(Lpb3YQ2|hA$ILhWhS1*-XrZ{An#`~xy%)xU{41_L;g(*C-w1lqvaO^dPQ6mn9 ztdjJt9-gf=W4_)UNo9Cq%k6|}V>6B=p?QYsHU2%EX+G-(Gjtk6%boi?e9 z9X(Aufn2>-H#p4Asu7*~%M6ZSb1l#dO&maPqD4g=6AGF$e}ts!;eT)q6_rXo*$oV` zH-fW(etpLT5fyO9gadGKfvPtO#*j9(ywYfAd&8iibjoS*5y9|XwAm-)8pxtF<01ei z+-RbSoGsD<3K*F1tJz|%lT#`~lS_rN3?e$*9G*$7DLp7@Y!)u&rX#BWRcW#^ZI(Q> z(rELoTEC67rE0STE7UOd-AcJkk4VRj`!g`fT&Q%tN>3TLyRef`CN&!jn=^|qvn8QF 
z)+%*5XH{XVVT-jYaD?&^7*k3wvKR~?wm4Rh_d{E~h=WRf9j#5nNl zFXf`p9|lHGRf^fzr$8duv>ALfjDmXHN}U72C8`}%FQWBP%m}j*N2STVN}~%~J`Du>VbYgs#o2Hy27s&})dw<5)u;s!P7}l(+5(6>bUZSYtUzow z`g}E%U1Y2ib~FzekgTLY(7z1ER5WG=cjn5p37iTLM=7Quxd{x`C+encIaFZ|(yJyp zC{^MH!j2Fq@r$~fpyN5n3&NF(n~$5ZTOReZMro+9loF1mUcHD^%OE6L3{b?9x`~vw zx9Tn2f+{vmSG^8ZmlBk`c{}R>gK43(tulGDH`ZLeLv#OP1M~ z)`6o-SNioyR366KgoABmQdRO~81|rvBGIm+hLd)^M~eN{JP7o?s2aB9&RG0o=!aMZ;K!Rh*AKJ6_F9RMG$KUngm-I(rV3Z&W0ung6@2~9=ACt z6b6{)%o0#q$25ouUg3~V38d~c%|g=Zv(Xg6bhusPFasz!kTE0GTs;cBIf#ouk}|K> zVQs7rBeH~7V94v@6(f%MS*gqUUBX0htSAKDv|s3U&0GbRy@)Khl_H4T)H-CuV1i+Q zlBBs&709gH%=Z{C*RFKRf$4;rGE}*#1OxM5m8L^gOxPkSV_{M2=cnW#P&nXz77dRs zf^7yqGi}txsA*|72v?h&E~Ny))Pd76BOC|yfq^;(kp+4}(m)YaTQx|VRzQ#k1_Df* zaoe(8@PLoYSfhhyAsB6rVWBG3SyXRYiKa|RSDQ2@S+quhgpg$e$tT$;EDu1fSs%f< zX|1Srn;|yNS9vWKrCb~9f6z}B#l(<{4JQr6mDv00V{8Z)&9Ep;Z%nx_TL zp-`?tecr9gNiL7Mt~qBIc{0yc-Mrgpu#w?53$)V+yCZIjfr_BAK*`o5U%+P_sN!+} zGShj+ER5#XjDe#RwaxiKZ7eoW(a^K36GT|hCY?gFt;Us3&L_eEo#ZtUFW~kBq-y&v zVhvJls6tSorhx(yk1tCtVW2slDbyj7*Ha2t#CEwpg?a)b2t`B~H2M?U?e{xohrtFd ztO7)*3Sjx5Zo1t$7P26c)4{>4J*~@ zgHo;sjpiUjNA<%pI<1irKJ6$CuSHg1YY>A`${Y`~Epr0=1U11Ec{D=#`Vhj=Lb-wE zT_N$pJPl$kM7yJnM+_};U3e(9sH8h^6wDhW6h~JIEf2RQ0!g&9xmfcigZ8M{a}-_> z5Qu9RfbLOUzuER@hC|k>g*sJ{kz7AFm=^jRtJL&L(6>3WkON*?y90MIH3#BDpaPBO z{jskQIRY?P*kbdbzNs~6g#EhHGRkynSZ8z5C*i`V4RF>N)CCX;S)e3KAjfFmsZC%j zB<7)xnBAe7*XLYo0z!u+j~m7v5bbX1YPMj(ZIA8AU@jL>TBvdzm;<3uf*ZBFxrU7P zi$aG&bGa#nGkmY1vW1pv(HuKrB%gdgu{BqdK+?dym!_^cgW`TI1#z;`DN}_}BDDdo zB*Yo0VFKIW<#Sojv$N=5YJSfx6VW7R3#45hjV2OaTC* zIHFms(>7d$tyTEB-9wrvKDL0?BMckl^&k@|CIyg=N7bCpwx^shR!Bbu_yLC(m7!P8 z^Er1o5CyC4NbYFf8fKARVp9_r*fNkqq*bD29Q@M2pWd57|!+zDuxaQ zpdhWFd0yZJfdr-S(=c$nP(y_nn|e|f%C%>puxFBQw#ub>pa~^40%^olgkTvQ&5J3Lu|v7s5u#w!Bus;yBZQrET4tfwYz)nj2O^Alkl~W}fg=x^HGPg3xv=Ns z1WVdI(Ixu!(1ts3z5_~6G$I%B*_g~tV5$)%az9Dty^$w+BW-G#>OK^tmea#v}yblau6f{a>lzRe)jW|$Yve9Ly@ z(V&$bj=cn)Qss)$<<+)LrvCb%J5nS3RF%t6AL`nPMfG#C@#&BA(cr*8|C3XUxF|*tAip~ 
zYHGQikvs%h$Q)^lNQ#E5C>qMvj4yCmp**PT#h@+F^-8Y^W|ToCjExSuk_b`&CBvJz zh*mdC7M*GxOgRU^ zd!~v^^+_8gq^568*j7-P6a73rtt13+HSr-}d1>gT;Yd51rSrpjJp`HQ0ttTPQSQ;g zI2!X_6i}iPkO`J2QwOfsaO@7KAdf_}DakWW?8X!*9`2Yhn1;~o9LW8G>VZVYN_Gj9 zvSss(-|_QJBFRcXqY?nSg)jhbL?HpX3X)J!4{Z>EPxCoMf$fqgr*|v`@#;1xazjUP=}uCdw5igJMT^a5wO>~UZA7$_cAXj& zvy$|FVght#BE1g`72yb?ot`JJI8{7Vl|ndA2>}HbB0EH^00PtfAYU#{#WXroA9idV z)NnCU-A}KiB=^#_^Fl$P{hB9?LBV@98pmBTDkgNcGnV?vEJ)I>GkunetD=q%E3XBCIhE%n^Ywp4XFVIK_oT zo1ArP1A{6Ido18+3;brI+676bP67D;v$_lCK*&Kwx!KP-7BV&Qg)(F8-eE?#;KcKGC$Quy($NRZq2xqUa4DI1MC0JlMI4=vC!4|nuVKSioNyA2Kmc4v#0QceaoHsJiK ztgxv$GV@w~pva?n+o&|{afJf47n5ckWaI6zH?>PqHBQ(naL;(tkdkrgeSiWv@XMPS ztb?D=G#gHc#Cndq!D3|0Z26f@S|(Ctz1{?cKmsj1wS_Nv!ezhl06;ficorQp{D`C zW~#x&lS~vq{76XAi#i{a=#DBDqZscn%7-Q9Qxj`j19-V=l%ASYd-Pj6<2&BxMpK~*PX8(mowteZhWKgyWr;sEL!rR z-4^`IYv~cE^@%GT?Zn08%71wsRQyWipoIr-``S%exI6M?^uOR^{NQyL^@CUMbMaAMy7a7XWxjp*zT2OC%6==IjFvfd z?+-P+%>E1RTz2i%Zy)gBy62xgqw&yN=JTJ*{C&GucgdV9{Oi3gJ^r+4&DRfE&FRaW z1=$C;>~;H2D;t?#EtJ%APouWnJYV#}cAHOr^bmK~)z;VNr`>(})n|MobHI+PP6NMS z%dCFtvRgC5wXZb~x@Ex)zpI^f_2o}&cys$j(MQ)EntAhz>(0FHxjUAgCmp!ocdq)| z+9&S#`F1Ne{_MH0pS6B>;L>G^o;);Wlq@;?Y}TuxUT&5vwK%=J*m}^*KFB0 zbKAnpmtDJ0CMm!0kvkTy+bQ$g1>>_HSpNE7mao1h)4Vg=JfH=zSbe*7r#3hJV*6Kb zykP%pF1YO2&}snJ0JLJbnG$9T!}5!I}>)7+m(}u3ULs@zGt^9l1qVaqc-= zwhXSh|igzdClyW#3-A;P+W-*W-)g z70;YOys&?;B58dhi;lSfhdSjiv>cmXA-KGWW_C37%$(RF+$0g-yy=L=N6ZesZLc$baPX0@Z!3F`UjmM~NBMJmjLzLD*@a$o?dC7vcG`*8 zM%$h7)LuI*orurId;a~8Ugq@7p$D5QcEA1lmwS&Ma?{VRI{(r0Zu;3(zYcENm^`o_Cg@JD%1C92x16={;r;ImS6qDV6+iy{D=W4hb>SEO!hIsQ>(*OeJ%7je zwchF-wiFJZynN#BrA^XF-+bUyX!6d!`~CgL7sdav5;?Z@o6c}Aa@#|^vm3j0r#iou zIrGG?M0=n9JcGaX^|PP;>ebJma3Fl&F1K~OM`|A;TI-&Vw)@;3$scCN9Qd>61$gw} zvKe~x68^#CZoByGYj3>f%d(yv9i6l86~Ig< zZoFJfdIw_PS^XT^`^{0u-gV&fS3d7XPe`AVNHD zyKv_Rr=MKy*uOmg#J_HrImq~XUD*1WFD$?9#)prji+eo;Xv*Dk>cihD-Kf90=~V%_ z?3x3y0a>fo>h~{)6XovW{=ODdtd?|RlMis<$EqV|Gnz#wVgKJ^y;14-*%1X|NQ#) 
zo(CVdc;m9d^4IUY49cr&&B_r>q;bKA4j9>|S?K{X&o;L2fu|Nf^XaqJf9Am}{`$Crf+zP>e{TNn%U@poO#IQO2puf(gXhjSj1?EJ7=C!gh80^@+z@^K%-hb8 z*PQ+KcY5a@@y^l4)(uv+OvN4nIl$y_tchj>rJ-vmE{-g^YP~3ksa4R@#6pE7Oz{j-|{8bopnZj z?Q1_%PuVPdxAgU;o8RQYH5t*0s|}_1=^L;9)e*0MdV+ps&!dkzYp(~+yL$hpZlb@x>9flYU;p~iTW@Htdi>d&fBf_7 z^(T|{!6&y4piP(DgIxdK{gq=*`q2ZI{BHN_E_gxy%F^FcJKp%`FRc2)*317zWqtFiV;}@Q~zJLGq$=2+{JEF^fNCh;S75Aw9aE179?aOw% z=-P7T;sZal4Pya-we;`>%dTAY8|seT|FxZ6y?Swd;YC}|{ug=xq_B+LwD{3ues<}< zwJ%(J)zT|V%kKRKbJO+ZY%LP42Gv zVCd4L!BI9{c?`Pmm#!`EJpbCqwt;@T{nzTn7Yxrl1kmjB?u+UNgLYeRAvjUA6aL%l zz^MFr-#+dGy6rXr-Jibonpf`l(zUOB!D63V3vTK5m!Ci7PluVOGT&RgEybS!7vB1dORikA z{zJzcf7CX_*bISPU%22KCvM#xc~Ng&%6zcc-Sw4g4uA5E*N=a5cjQNhh-*F|#z70c zrR=6}CiuYz6`x)Dr33fgh8VA24t9P2>PL@GmR%X%{LP;o^1)(H9}d`Y^3K~=pR#Pf z-+kf8;s?ao?KB(^s&o7SAG+vT?{n2%&)Wu(Mm4bO{<;_UzH4#)2Xg1R5195ecaMv% zU3S~Mr*6D!@dd9uaOt%l5aaBX2LM8S`@qkGed2e2?W}EyaU9sjv(V=7ypz@({@bIz z{pSy4Z{_Xa{y+D`XMXhjNo&^6e_v^SKn(JrT>zoFyE0&(2R!}AJ=+rFK(LM1KJ5O0 zI<|PpWbbbLfoVGENznan`-I;G#}=Rcgn!tI4~TK_Zf5~PefzBc!-gp-{|_7f@7a)9 zLmx`5J9P6`pIUn5HXOftX+*EwgWtI6W4&zzE0xUer&_Hpy|%4X0j3At*R0Roa_+YJ z1sIuiBOo~^?IHYaTNVe;R|DC6ZKvxG-d5Ow(RlSrz{bD&++IKam#v(=+DJ8D={sj{ zD|tXL?o5RbRqs1?8%ap-e_tS7|7Yv}Kco5swN0X#*^Pzn;j3TH z?6Urdm%I#@d9`u)#>c<1`4hKnJa)?|FV@>%yLk`xmXns8^P3wpmpgA9_TF|Y`aj+B zhfifbyXe<@U3=5#&?A>Gd|<(<3-4dDYQLRc`)lvQo3DN0AKdli;k#8gxvQP+uHA9< zar$-VE%?QPSEmaq-G-=Df^zx4Yp(>%z=K zH(zxB4RvO*`Q<6MJalA6S@7X!yu-J<;=JmA z9sJ1C&um(dIV^MP736_GxMIy8A3yY|`j)-kfbLlEq;<+27eD@)L`&_Px)Zbn%fFU-qW@$<=Fb%6xmf)wk7u^5=b5 zUzPdOLKj=DWY#ZqOPlT->~JV22!~(syHVz-MX%mcSjEt#;yo*m-I03A&YZbG`sniG zcTx5{bpHLvTzpiq^pon-53XGM=Z}03zxn6) zRj+yFW_!~i_a-ZyrGGg5w%>yI2lvvYzx?ENzx?c0n+IQfcgYjS{^2Wo{qea~r~YyI3w7<3ukAD5D>KPI zbKTuit;M1)uP@6YjfzaEYO>S%3Ahw`#vRN_j_r z_x6o@qPINp`qyte^E~uZUtAijI{Bf`TsS>#dd7Rt6U$yEpQbPPhBzx?ipKfKQqqwlW!q zCm;FK1rPmo<6F``e_V5BZ_NRZpAek7`SyFg)hC|*(=XV|A78iRBah{e!EZR>=EXbj zbMmjBcXwKK;@^KgzVMbEAMK_g?9*;Je7E|hUyI2K_`7%g>D0?keE5ZTH-GNRtGA!M zx+nChqn~+W&s|e{n6 
zlV_fG`jYLJPSj^tJ%0bzh0V}ick5G6KL#R8@#f!u@$Fl;zwDYPcxkrXE%?wzP@gp20cjBFMnt4Lq-*Gs8oIk1q`sSHKhNI#eZTzu9S1+)U}o-n zt##ekI|aaomhRQ?u2^AMNrca6m07!W)j(;CC2Uy5Vj(Jc|t(; zz_b(mm(3YQs@k!uEiqpT@oUv#g=!fM^q#Lcyd(;<@48#|EcyegI)iSXYn`|zVNrQD ziC%?|HyJ-Af7u;yzy2wKSWhE|!;PtiKIrGt+LIv^$1nYV`lQov3rC?ef1uEeV9nJC zwW?OTz$QV(;H+&V!;ci1usAw2%o)Lxevdx12-krp`mmrCn#R>ifonJH)lravJ+HWy znhdI+wU%UQx|Unnt)M-LfLDtFTCgvevHGE`YUA1!#O++!;?vCw z1`|y0J#n^OY{R?m*j=n6crEeT2-pxM8G#VfonbE|JpijXFAr%X2jCOvIgKY1_xRjFcF|;jwAND{$aUeOBTI{R?x>f} z8oj!#_|Vf!2bLM^iFn##_TL2-X*G+?;nvag7#~zIr_lyj9FG;F621Emz_NEJ#L@T?09Ovi-t z%s2k+f*^?O@Y9NNh;_C?@m(n?>SACsm6NWZlEU}Wi z&o4^rUJPMT&p1epR#(pHtS_U@u%NMA33#)?f?!`ZerH$NV+*fO?lYKQi9OmNt8`2M z{8cy=m8b^D1geOHGP>h(l=H4iUr9N60aKpoQvA`ruBI@CZ+BShVtcJ(b6`g96%c`l{M$$62UjQbJxUaJ86+sw?RwrH28FGCe{Wg0*{ z;pmV!P6$fs$eU^*3XME+6TG~UrBSBz@6)W!=N8>mhFBDduF&d5g^?ummQP32L-t=o})to&8BKUBozb8P7-)3>A(_4j2TK$(DW+x z-7~*}yxHQm9;_jV9)@pEFxM~%Y+`kCu>Za7ZFpM%jO{qO*>Kna$*-W8WFQss+XgfKAnRqVNbV zFtP2KUDZ>VTXgY&s~~5DCt1VqH=Aj3eU6`DrAvG`=_u zN!_cCjoLOiso<~Wy6>UI-jJShV{8*({lg9L%#&$^MBNS-QvI9)hkPPgrzxpIaWtOQ z9(P{s!Qj(WW_uCqFQm~hF-VI$j_u=v*H#qbV;^g1)}HytvH>aq4oc0+h`vIrQVFq? 
z>TFPlT;DOFuG0F%6No?s>!A@0ij5Jq!iH}nAb_B)J z&akqad=EE5Y9~JU@w-2>zq`fzep#x|bMvw6km-bc!=n0e#%XlB(Gj+`3Kc94*SG8A z1$XyNKk29tDD*uoE?0MA?7s6$Ma(nRq{Jv%!dS~?K9dN=>_vsKlN{R8?|Rl|w(iBt zBnhu|d|?cTeGXByxOQd)i!K2hRGkSC2l!p`vmiQDh$^lPu*>&BXnAXJsCZ5EHa>0k zSl(8l;_Vcy$z_4EFaq_vX9rUptGjm&R@|`}ynlh;pp9dT20j@+_hTz=COo zbxO^!sCF_1F=+5ZX9u6@NMjIVlW^(A^5RpSqB}lo-;*VgoqgLuYG0pO#;TqA(ed$e z(|e6@F@a^-_8y+mrPa^+v6}-+c5lpO<=X9tz2ak~%QM+N4>>RQh0r&0oJMM%I(R0L z*5XBYW)|@Xr81w~_p>dmkl|^d1It+)4euqlug#_v)paUB#*ngMRJ$|s?P7>i@Ac`U zB#3tIO!e&G$mJQz@J3cgQkRGk*kNZQAt|h6k@6y$trXcoWoxa(U=uEUoyzU_TS;yp zM1pNvt*hx)0LTj=2=JD(D(0T$I0n2^oTP~6JPb2+qQLDe<9*;vW#J5N5vg^3W>#Yz z(V$0XPln5Sgv>^@piNcMp5*kKem&BMxM7S+2#x+(b@rK9L}L{}R$CO@Y$nqd_i=&} z$SXz#x_c?aiXOS$5wRv3m0NY0CK2mv>K6Kr$xunT_8viUtIhAs)bS~|P4qyf7bmCR zEHjs4v9Fd3<-{iUfU^_nV$_z(BRx8Ze?Ec3`?z!=({BlxX5pLb0FjV+t7EA3yLn|^ zBdr_%h*^s{QcdW zx=?jjkxopunTaJ&4b&~EMNfnA2o`W{n`&802r|mT%63B43rK6k87;D_??@q}J;MN^<@8bL*|-TDVXHrkk(&_ZXe!f&aX3OGlv07{|vpYjP)QbxqD6MgoZelb-JdQ;T*MS%;q;qXV2EXxbnSAXN+ov5LEl_ zY(Vj$o|+_%jsZu`M~~n65^iN>Wro=Lraph&UxCN3sXg-cMmM|a_KQqSFYz9Fi|$o6 zd&MBVg3ZE2TiAG%V_Tos;j3VNUoX-4%%yW>jqok?9B^iy$$fiZF0koTXB}2~3hxFl zGR0D0^ajt7^V;K2QQBnvA z3ypF%+OgSf2sv2wyox4@U3m8_dUuA*4=6jNAK!@1Y z5;i59|3risge>fQC114hmE`c=#i!o`J=wYTCgrdJjfU~CTxP}w9J zni6Jy&q^<}jmsBLoGhak3vrs$BO6T=D6l(y>D|kc{MK=zH0Ag-Q9m8AKbsoO)@C&H zZ_NeqAS8+Z`_F&<@+lsWvcT#;!v7az4ZIZp|A#L%XOjbT=nJ_d!~fEo9srmWChD7E zTvP{fzhFjwdSp$~mNDi-S7c3)YyiOYopBMMY@$4-_wz(*u<09yfHFW}oW&DuEA6fS zId2)LFitQ!k|;_`fEECvPhWrW`^qUdO4LFKBBAREdIw@Z)p_CHv9Qzk;d2F>EP0Wu&k5%Z%1ktL_6MS?znF41JEK?g{gnRVDL3m^z; zDRK)F$ma|J_C#{1*S7~5#*zq+psMHW#e;O|UlDxmV%0qTZ*^ho#p7;RE~8%!V~eXu zjlqcg35(wqC@6V(4GKyCWh$?df%ZV!=!cAcv1&)7^CYCR$!FMplLHH$)(d2y!~-)V zhd!E77fxGX!=VLfBRNDQ1ZvIx_Z1iknUUqlgd|1E*^$WZQ8=@JRlVgM_d7RV_% zoe;Ro^wPqU3t3@Z7C%=#S0yD zEPfeoA#$+CSMNTA2qQ5U;mh-&|FOF+Wybf)VFZ#K!Lhx@atK=Mvq%|(qA)x-GXvc+y(qntiUf4g}>_rku#>+7bG6l~fzx5lP3H17E-qHbm3YDQldQ?mNnzz{ zd^|oCx1I)e1}=B?VApZ~OA|$M2`?-v`dteahWabana&(@5pvT%6mh%jOt$2)P7kB3 
z1t;igq0Pz1vLn-N-w!l|CW#~+5lE4wUXmk~PTd%0AGsf;QZHm3F&b}C{SYUl=iYfA z49L6zjm%)LPrGujKG@UAo-w=t#??*7yp%mBM1RyG9ix>SmU)B}_XU#@$>p&$MSmy0 zF$|93sWN2(takebSP^F90;vO zL_*Tf&L4ABi4bJ|A^g6(j$%&m@C-MD@?3?LH}@>$>wjUt06AH<`eAeW=iQ-#p8E(F1%3M3NKu z%0TBUx3~N3Gqogs?7jqE$?c?M7wT`?TU^LUEm3N^c>5yawhw#g5SMOAJKeIuidaEJnQ4?au(a8qxW)y zsej^6RfgGaw;bYb^zE$I%v_$=q_^lwe?!lP=qGztXAZw6w`3Q2HYfFJ@>F6Kk)7{2 zO1N4Q(a@J7LgJh0w+kKpTd^aRJ}%sPT|<;e1#h|AI|SMVT-UEJO!eo_7VM_NN0`tT+XHPJTNTMM0{jT-4s9}_eGuRKD(Xo?Bg`{8Uu<$Rz7Pg) z&&A|pkxaWEM$*IB8bqcQGm(2TR8X7gV>A*Dk4Nau6SOsRdU!nPdlWQz(yPq2(28gp z(Ng+x2v-=NNMYylQ~j8WU1zUM(p2U)%NjfzoviI5os|*v0f#qJ*qf6Tg8Q{UevRCf zzXoC^eNY*CcSFo+@RqPPqcE_sBdZRX7&{%@Cx+MnTEOZd#keWwK4KmEc&6WUb0gDA4Cj$l)pV`j4b^gY+%vqATwaih%PG z`oh%rC#|JZ5zu657i74)zKKbSxHaxCS1(pG0meA)TQvz_*of2pKax!g$4{lb?7yH! zNRYxueVkTzO5RT`Us2qU$=G2u%-2ZFXFQ=h7`UBHn+;|rBcDn6alQ)!2`Ad}8pGYg3T`k_8Q>B8`LU~!-TUed z#qScfd>I3Rfp=8|lvjloBDPTK(+@g@m#4(I)YnZ~hpt`CA;LV{W}UglF&kfBSD-T~ z*ar_l6zX?lqmE`@N13i(5;*o%b+UkGf7+)<+TY(^(BspJe_HUFvsR1DuzpAVl+KPl z3Vd9BGG=sYa(~oy!2UM-YfO8HnR2Ghb-eCfCq9kq%~8*WxVim{;Y(Y7z=z|EO~t?4A!yQfpoPQpGK?Zf#Q>sk3wTYP3cn^Sa|16uW4!HLEY3+8dJb zRbIpXKJ0s`K*H<6(3`*ucmXu*&MdsBWj?SQx#gC{tmvRe$;IUp{EEGf?pS{BdD3Y# z%X5655-@UAN9I=SY(Hj=U}Vmj>&XR{nua)15yon&t>?|Zg`cAz{1$%q+K47vFOlDa zA_#9gcCVUS&j3h&MeI^lGl?STS+7$p6PuT*S0#RcfF}sA9sL|`MeWMi<=ACmxnFtZ zjo4K>METPIMuhs)3DfD$U#{HQu94!U%(>Np0ykodBvytNeB~?92{I2#k7{!bI&ED{ z#DExd{pfkI- z(VHh+M$@-|$;c`eSO;{;pZK4+E=3Hzk?8D4(5vWe(ycA)Ud9ChOVq3OSsy0MURYzpSqp=wj+H(=^0gboMHG3v0J?tY=1PlKgVuVURg$J2|sle zsPrAO_1qlq^w_qR={Y~YvI-_(z4tsc_Nu>fknk*x;OBH}Kf3W5l{3 zp6-wijh7iv5&IJYB--WTd9JHssp;wse%INhNlqZo#OsCN9W%%9%Jz*J~ zh!=F-PWiEC*V{3&nA9&i5sRtwmM+r$K%&{K)h6Yeckuee%z@kr z+syI}PAIjtGqU7*L@Q-YFhWqs3hoIuxHj@Cb1|--E+(;4WabJAb(?%$?|7{cY?b1N zB8<{ZIaY2sY3q8|)9Ei>Ot==$Br;8{h+D(OqbD>ZdeL!!aWwl5I?eig*t8-Ab-)za zw1T>ZZ`|_`T4?BUsWih?!V&dj`lH;`u}Cdq97Bead~bcLcH;Y7EIcAD#=qwGNI{w* z3^igXK0fmZuePaw2jv5~#7!-+Sxs0Q&j*?NN*;Yf>m+Q3?drL^%Uc~9L^HknN!ku{ 
zEbB90;T{i(gH*pc#U4Cm&}aI*(XsJ%YGd^xTEpQR^Z_GXnpgr@;qN_l(Im^OD-0mr3CZ8)9PQy$OxDbK=6CqzSb18ec;f<1>?_4qk^R9WtCB z$oO3s%!hj({NjK((&VIIa~tLS#P7h#f8_WJu>@EuP58L5rUsM|ysRwsb(o`FdFCSY zruB(m!=R&H9g6iGp`0J-lKth`vekoq5Vwpc7N6=t~CDKnAG&!LZ*BxmI*rJqt^VY6#k7yIf1^_$8%pD zjvi|*h!gtshx!ZmKw3NT7hL6->RUssq$~1^&f_T=$bCxA*O5C&6d3C3P}0a@Hx1s@ z#~C^lY-y+WHSTr9*CS=MV3D&ZBg!5!`#jVEa$|^G&y#VyA+?gGZ>4TqAGnjnW@9q9 z#_TY2hjc%^+-P1yF|tP;aEaUgHrVb;*%oPfc<4HuwX;Jxp9e;Z%@k8xTQBb%F5MDz zJB;ai!e3;3Ew%k zOf6#9`Y3T*<|78#V2ZUtT!x@d@h5oeLA6)J59-O@T9=4NMxJx(aD+SL4z|d`mq6Cz ztRViKubGNq;)5T?ho{fS)Rd(*vgeTJ26YcZbx7n+tRxoig16~f{THSSj=IgzQSyU3bD}4t_1Ttc63j!?_3XLYQCGdDP=d@e%7B}(~ochShFn|b7XY96mwe% zRYix#-0wYrx+>4N#7gUSUO${sGjmSww&z>#2fmDekFWhZ!-UMBkE5hUumL=f7?aBa zjxWCF_zA!l#bJJ8v)q39B%2bEV?onX{&0FJH5hBQk^>BQN+#6|X!45Gem2%wr6&sX zK(^TidLM0`pP%L8)qY!EvajH|xsh0vsPQ{GSn$0@vrtVpyP4qyFGmDO_}*c~aefsl z{@}ZQ1%XG{=r=FC_j0vppMC>pBaIY0*EN9NaZub$kQhMMo?c$GAoZsU;H6CqwJV2B zWNikreEE>ZTU{eN6Xw7UPL>*_8@HK|fV~0SGa;0r6Vd@nD;>lypP;_n!RPz-;q0Bt z(crdf)*~Sdg1g|GPENCfd2e`K(k%wQ$xb!x=GA+g$`5y#wWM}y+Y#sl9(F$SM>g7W zCm&~|FV`kz4kw}q^(+#`uJJ!9s>wOGe!rlwfXptIzDO5!7yfnOj>1JocUP@cx%hZ4 zr%DQ*ebQG5-J;E>i|VP+KS_yV-cQK*5+gLE&}sa;kB3W7ben7+cu9O<3v)~%uE9F#rU^ zyYJ6L-n2QVc~ZL{F#?5@{#5LoxkyzFMZ}vu_1hf1r^s_J-@(?83woE+x2p~v(%?$f z3$*vF@W!X8K&c|}Y`B9OL##=IN`j~&9wNj#@V2x=DR-F$1{A+NX-VzRp|Mh=RTy)ZhT~U>Sl);DRJk9Qi7@utFpaoBdz`U zf~i08H+@%<6)9Zf22)kq(8f*wd~YV|9{<*=1Nu9J&7}AGB#KDL^WrXmF<~}v>;-e47fgpkLwtO)1{?QC+$qzpf*k4peW&Q6AD%UHAhX$&oCZK3@5Fl zFO+F?6*aeAb`(;8@h7Xtx%a11J7oIU-2iH#ce+1~Pfp5b@fo&+;<*wV=zOR>Ltf^v zuk7~S)3fmU&VI$_ zQN~nyUf*2mIhnz3{0l_)bAcr8;1_}X^+g8&qfB?luF`eeI`L&*19`d5>e-a~9#2I` z*>uXZe)&oY3wZjY8>PYHN_a+4)jqt66rymUF+|7k@Vw!Dm=8fl!=bNYf4(E>abLu) zb^OXKE6Qw{n%HtN^vPPO{Q=!d`_xV8K`_qbGAxW&qc@TFJk~!wO7xKs!8qho&4L%L zyM`1VJ+H23$CN2nGHQWdK(66oS4O>n^;jRJ{V zdajAjXZkK|sYi5gjOjw3H8Fx)W?fddgG2}=NKJMo^0v(MirMz|$V%I{@~L>Gb)sV~ z>pNE3oRqaE=*+N0TFho%qrAiW1*e*wp@j%iCTuFYUE}IYGwpmQKA&qBwm%m>kEkcS 
zA%!=?-i;qRU!87Vpo=0(Zcd56Hk@M13u$iZdIownUefsdIl?%l6Xj&Z?vrm9(&06R zvAX1&-i6wig6I3gd8{tpG?7g%e56jdo$L7=e0_U;whg1qu;mv&sF_c)!^M|mckc}R zL0x=?=4@rsk*RZd)XNjZX4SXO=cpg}O1Ht1>p0yTF@EqsxD06zcSH8|G@!3N`_zW> z8hZYz--Oq!wPb(gC8gOiS3nmrUpW(2G;WoGE>k z)3Z4A@6gT4s^5%G>;3|#FCkS_SE>#(>yJMBla6Bg==HF!;==g~e)~5Eji&w~Wzpv% zIp$Hl&{wqGp%!5g_?L4@8OzJB5=>|F7~Br#YVhPLtXDipxV2H6@9((Qy@c=#r=_fg z*0)}>(m3|%J%I())Xu&u{^Yp67Pa6j$cyHdGd%#W7O=TLK!~854Tf89%Dn$F;y$YG zYm{{o0n>-Af4f*{ZF3jo_c%6K*^s2U8i${;dv}h*DQRm0{;ClpcGvsM%Yh8?7B%7_ zku;!5)MqYL?-1l_$nM9UP>94H?OFk*E<2i4vETEC8C^e&Z@fD6GLFG<5e~n*SEzF1O>vOzvGrDY|a*3>63zyJ!(;JGYnKi{! zLC2yJ`C#~Y8Nen_NWVSBdHatSKu9BW@3A4<2>R7jj(GG>Vr8>sM2wsL{)?NVE_?8y zCMk_?8y0+eQcd90J zM83VnS@iVWAut5czO$_l!aEB}qB_N{hUE9IxhVCQ+}1t<2L|H`J%Fxa;_h%kTUowv zO0jN-D`%XL82|^%r~=T@*;4Nz!nj~cd+K`;FxH7nOLAa4x{27~_-y-X|Sra!x1z z3iR03(X6^P?w(_`{ihgh&tR-J6VBY=OWI_V>x-2vPl(Xq5dT>I3~I^64$^$itJ+OP zx6>y5@Ho~WiTz?o;d=$688rgD5||~^hMl$9rk-9k$7{KEjjQTomKc!*v7pSUIP;kPz$7-C8mP0d5TS^D zTvdZu$O@@iQv;@or{ZYf5B@-Ji4RH*%4qEI(0pT}q>VdV#4OuRjA^PH+LPgZwuWxD ztd?aSSf4NTiQE%UY0#ifaJh29X?^yEa`V!A8OqJ8&xXSfbgYouax&zaGAz8TUdmta|1*o#7^*{1&vk@Ou1qd~f1ve4M`5R!QPZ1tn#nYTyiUdK3M-^(=OxNg`RfM)fa2yfe|0O!9-*|i51!EuDQEc*+I z@ygflKg?^mmsF?v%JqAPtvE~p%KuJ4HU9ao1y*v0Xx;hO>eRZRCl#N^{LPiR0~dgT zUv;?FBue31^c)cjY2k)>jmZ?#$ZR(x`q|AyIR~-w%9N$c@NLzSvR6W`MXYl4eM1}gYBsE?70~AYD*0IB zpw3}$qL*XlVCE4WCyezhgPhM__Cn;!m6qKdM_^#TsXY{o3 z^Xy;HES9t0y_IL&u@~~QTGnZ+57OhPdZu5lpU)F*@CVpN98O|ZKcik3<)EsUhLMEM zvXr~tMq{w3FdGtlQCImYAQ7DcsY8iKuIcF}X%_82+}^QY@#iGftQcIOkS--aZ#s%( zkT&wTItI{P?i%YCPX&kC z+FhvVwH&aM{YzZhDXr22m&p^9e$b(WFU+9f$vmGZb9r5;UqPhE#~lexyI$!lb*IZw?<BK{)hjI(eH=BlIgf+pYwU)`y!C z8&6eN3L0JLYViQUW1R4Xx2H>6Sr<)M+3OAJXZdmi5=U*SB@eh+DTVCpNVe#YlF=8u zR0>>=B5KQqdtRkx35=^j9>S-bG8o(1Q#_PGA;+q$5Rv(qxyW3Kk11C?;Ia8xuyyc? 
zrf1*z9uOjrEM|FvEH;_9qBuAEt#V^%`WyYDvPd!}gE zWCGQBaeRg#+WHISd9RIZxZ$MPRf2@0$-U3Yh9Fr<*A+QQl%A{ym>n#wTZ%p>UfITm zvtiQPzXlf;f{1awE@q?}D})U--$}J|IRP;Do9d9o+8quTeDUN?mfEU^7@C%FhPRWnlpIV%SJK zhRJc^nj?-Cxlq09ZODr?KBdw4A+xn(mY_@y%~Jk(s)+?c$+wH9m%Xm@!S7jk1ti5( zCvL$vp$9;Zw&^=Q&Nu8K_9FJ;jCS!M@7YF7GRA1z802%-`jh$y1rv<4SP5aQZ=kB5 zOv`VGLT19OZc*3R+fcQVFz0VTQA{K(o@CXL!EhXs9 z8r@2NCgzseBU4I2doz4#E%F5O+uqnbK3_76Alz*ngO#Wx4BA(oc~qaq(#8fJ0wUJ- z(O+9R4|vzgR60={>7{UfQjE0^7u-jls>YL)Tf5^hTh~83*oY>*sw^1rWiQxG*V4Jp zyCngB2fLHR7?hEpcFyvN4*H6dIzd!MUtW^+hj4(Qw^ta+6D5(~!`3N#;+k$goKj1- zBH;1h&y9g*!n}>k$%!MEb#aV>1Q9y_6_MsOZu%z6lkR-GG2qnl8B@SmfH`{^uSbiY zGN4OGKBK?68bXuand)mbJ> zClSuvA*ddnwYPTncfOiAU;&!;TQe2Z9~e1+YFRWJGy#brE>hrq^Sn)=xPcLP#Fc)y z%#)%pQ-dTH?4`vVzGDx+C>9-aarxF zasDxTJ*~R*rm(YnC*ln4llHj9^|NP|ll?z9qdwSAH8lbT2l*8sppeR?upTD80L2l> z#gaaMRF;9i#ZQPPbC`sombNa6&BQgMdlU9?S593FeW!KF<7Yz|UeV>Ge1a;45k|D(A2XwRK?!Q85NxXQyc0KNHLBYu zPHU1J5GWuYsFlGkBLV%QFx!9n!LGqj?+zLdX15Tbf9x6kY462x&I3I&c(&E_P55t3491L=*|hx?UvZZ^9g`;%*M zeQndCUISk~1Y3WOye;bm{{H`FeW8+b9zUxZ&V;`CxGRimnC?r1xAQhElp_soAWCMh zCii%5VpS8L%ZdC9EnJ&$3a|mq;FUv>O%KH_dlaqrn6Dr2pDn#MS)|Y0B<8xwY$P%-rnSmX7Vez*{(3K97~L zR~;HlpSFmkVB??2w&m{yW#IM0MA&+k$558%4@>16pN_nrc}_Df)@5t9dw)tCt?Q_c zhZXGE0sLd^JLS)&@A@qE$26rq0sqMc{&`>;Jv`*tuOvVg?Ua5 zNX7fWnd`4qW*RHaMJ{yzP{#fRnK9Y}Ukq3JTJygxnlevK)BEMc^Pr9BPpHesJi99M z*kippyD&o0+j>O~cwfTgE0BUjMoJo zq`ra*sOc8vEATXkfW=J0rULG~z7Mirb}Gm^HSltJ|2sgecl{npG^o_?U1xeA8@B+g z7Q7>LPHgz%H+Dp(nA@SuD&Jd5KtdKA9N3j{0O{B4$qte0XTCX6ZYctD>DUWB{%v#8 zK9MncjvMjjH8c1Nv5=t$e%p0ym+4S?vw-<&B@^lZAkS+xeMFLc)y%=U;6fjCeh|_O zJ!nJxJP`n}qvU9%u=Kl1d~f6ZPkS>prk9AV2f}oo;x~Ewe0t=o?FI>g**x{YVG2xp`iaUTYjZF z`!nt#$ZA}YaK>vJp7aQv-bYn2~L?TJ!ix( z$@_0Ms}Nu;$7<^t^Ct^~6aEa3fPg+EL_$2Ej~EWze)1rr5JKT{{rBFk@##*Ynu0u2 zJgOaY?s`z?;BO0M-4GvQS2n3&XxOCk2LtCWW=Auuz|5tmopmRM!6Ei%Ew^v%2ib?S z=oh?c5aPD%xGa6LyI3-;>gxWoo>`UmveV+JN1`_$y_SLL@>M_`Aw+ z)O^aV&lF1?=GEkfaj6m1I2!{kn?24E8~0KUAY(g&sNXsU5f45b^=<9;FN!BE1hoVD 
zcTcfNxEbY8lwm=-zGx0XdZri3m??%S)%Tv-hCqDK;?dxby7xuB7K}kg`AB!c>zqE3P-s*ij=^UZGv0VYIc4AKU``S<5Mn1y@GTN%vE7Oe$$IT3j{5Ike2%SwBC|qu zEky;vGl7Wz84NFQEHW*skGXGnwuxx;ZDZ$nky7;`3di>fdjnns@3-%&Hnkyi|0^m;K`n~njL7Mc)TPi6!Ii)j);k+o(0YUe%S+& zt9btYDrSyog{1Na%{0$Ef-L{)dB_c$N&7^j=sBXD+#!(V0T*#RY|&OfQI2>L7+!Xp z1`qvab2UZn!dHlWg%lL{3WajhDH11>4rY}?UD_0G6LKT zD^i5Lv^HAs-oTM7*o*zyD;0xRq9Si?|2)l_<4|E)98m?DBKb#<7CnIS@uh3OY1810 z=aJ?$3hWj~lv6xmpx39rcHkuUCY{QLV>Nw#Nc06&i{!G!k{?gwa_cHAC&aOLuuFM z1bH6h0MFuT`wjlUvK(pxs7T9E44^Ln!)byYF`xMVwJQN<+-o&IY_R2qR6iOpe;MX% zc>{i$c|mDLw1-I#&PO?l2<^iHN^yX0v84uV7yf^~ew|KvnQ5p2u$h#3FGuRYHJSs; zyF zb#Z2<3D}M2&zF^AuSe^p!LkB`d5SlgbO2z=N{)I^#^*DddoN4f5}yI9u7-vSHi}Yt zY<#nsIl~b1aZW?<&5?U6-x~`ok6(}puBibTn?A^08B~yEYCt{tt`oScuIYiVv!yBU zry@K=GR0=86uZM&h8b50Rlt|Q=nwwx&)g?vBk+p>Z6hQ0j4DG_{^O%3Vl5?=)dS5C z`7p^60IZNXE`X9{gZYvm>&k3EvkK~_U9PTwJH@SK7bQTGD3{VE7E9R&U7-2T34Flo z3t8n>h<rHDYj6L)(C<$zWFQfwOJUiyI*Jx5&;p2^SB@Ce z2lPj4!EEhAZWMfpB8t@F1M*(|j+Gwsl?mv|^YGJ+j@pFoKa!C$LJMzt(s-J@s%fAPv(NSre^A2Z=*l1(cBz*3|!733Q53 zzyqj5J#`qIZForqzMod+#KWiVw)-RX@8)8%;*T;-@;+b(KmOWxJgDm)vhy8q^7fbm z@AAi2esVf^gT46JPYN{1f1iQV;s_cX(@S=scvbgzvon7ohEh6N7Z5eFX8QAk@VM`Z zMpXZq9cJJ_K^SNnw@-*TVq7rT!G@@IX5lE4OwUW6>UT zkKAp|`)48y8m|Q0cPvm%po*vV=)zc2>vW3$9emXlG zF3L%M^tG7P`ibnczFO2g5)kkarJa-7QUS(FO=YO~cb_AF&?Deb021gfZQCKxodI+r z04Ne>D;Au<^%jRiz@u68(EBP3iKe8$2CFo>DqH(TDcT{)dn&*_Bsh-V^L)X-RP-Zw z(n{+@${Wr&t9Pp+Id^Jm;O>r9>Fdby%>*J9#^h6CAss_w9J~)Yid_)-;kB!-ky1#NEekmP!zjiz0ZohFd{KlG0mT!f zG*8?xu2~w)pUm5dMvfr>TcP@xG~%BYO|q|0ge4IrS7Jh>51o8#f~@SV64yWo>&?&gP)nQ zTD06O>$P7mNLUL$lg;J*1lqw4tHg=l2p$LPE2{tnPa~frl z&R-E!iv}+9)cQp<3I@?9TUoy!a;0S^J+TcA5dGqDy4ZRj-K=Y|J6As>D$;JJ^S>eW z$%hXVMkxdO)wxBPS6*icc?Lz4gMGfja9rO?xRyqE33=gbni|JTUa}ic`%1bnc}51e~L(H>KyjD z*-8ka>uaEBgWFRnNg<C^`iy`6 zee6jV%Eq6TA&Pn^*CGpK;GV}sLV!N7c+4nAEE>x$|M$}sU4*HI!cWrTiNtlr4iW@u zB06xpM7t{s(orzBay!2f30a#em0&Z-t&r+p^H|8uDKkr=SJU*v$^o#qiOYk+6jfmJK$?=FFl z^&cf-nn?PoAaMbWFQRdQn`E{@<>ez1@)rzw!uiuBNqm7J}nOtH$H3col3$*GuNR>|sD|7>&*dtJ&S 
z3S{84Sr!sk&A;b|o$uc%68J?JIRNO-#V=~2{V>CXSjms|L2F$L6gcZXD0${!=k?5j9X7$l^Lk!*i;q!N3^Y87N6~ zbG{nCZ-D#03CIdcFjbJMBnQpGP7eeE#1y+dR+R`)s?gObK{Fk?pnT92CX5uQWcD1@ zLpY*}OUi{cV1aplA2qs1>!25h#bIhGJZ>Yzc#TibyL1#FQ;a)ZjZ{{QQ@? zs8fd3irj6jI5+9W@Z{{TJR5mK;S#hD1V&*0?(JcR=A&&=uGvLq#DKn`8k;j>!2|++ z=UaX%xDaUZie2wGW&Jl6pnlmgZ|feEV9`ZWs|uw4ol-)ZwFE_-M|va!Ghk0AzX!+9 zeMW`pGs*$q+-D0vBfjjMaGo!hz3646 z&YCA`+g851X_E@ds$erOTh~(JJJ?sV^_w>W3*_dr;_yS)C-oyQ2d}n0Pabr#u$~<$ z>s4Vn)c1|?A=q-6pOT}~XC9f64)Vb_PUU#C8Co|DqMt7>?-|B;WQyO%AH-^>()5TNgR+b0)*H7;uXZLAWVrO)d#)YVXDLu>$nw z4^ySrDbhEWfUo+Z==8nQnXehs;*(F7-Y@pI=ML-l6Nq$ykufcr>RF^n;4*tbb@qrr zV61~sjv9~R2N=|H49z0Sjqfg4jFRHr%$mqWn#Qs^;%BgcEo_bJ^)Et(u(Wxi*7myK zdA$!p3IZV6<--$p=;jIRU3rH-C`LuSfoJV-G*fmx25vFnr7PoSU@`PD7juPCeb&N-agpu>8vU|HgN0c zQpR4lu_I$qo4=c=nQu18oQR9FRGG^fHyIkVo-Vw`K_)W-j_M3&4($k@r@Xbpz-_Tn zk&=;;j<>-1?3K^=h%g{G!R@?#HmmjrdCc`4>kJKjQg&-1Oz|7XCd)qxw@Bu!Q#J>BwG8 zZDpbc%$(P$#F^T1g1p=|G4jg1C*UmM9BF{)oBHvA?au@R?x+TczPc^9(<8!-0}A`v zO!#8p(uSR}HU)`t9#wqG>ja4D_o54~0AEh@-Ov|@kp7%Xv%o)^?e%FJRkn(BglWdn z1tT)bJ((Au{wYaj1CdNvq@N$Mgb=2Nzqn7U9FZV$q`dCQ zAhYiINb{XE`C4AC)MwH@bsF4{y%3+to2+KlEBdRQ$}@4B(#r6w=92xpNpYLz&VAiZ z@6rw5EGi7V)%F7v{U@vS=c6S@>IwFv+ESfB+8np9*Z5lb(x+@Ax}lX8R2M_b#;S90w-qEL|CE#>&#{uPjl@c_zW*c2jqq}6dO zj7tRJM+(1b9G8wFt>{ie<775nffeGLvMb;|HLAth%gbuy%!FCLVv zY2bL{Zy{BQ>!4pka#BS^0I-e`J4fhOBkI?D6YyU`VY@FN2%9i(F-ROuJN3bS3lTx^ z`m{Zd_K1NR$anTqHm5BdGZC-Mu!Ln*CN%VG;l<*Th*DGZ2G8V+Cyy5odDjyQ7IR?; zcuzWPNgNzqnqyo(A>9_cFcl8cri-TNkUUsygg*$FK?g-Mjn-53o+4n({GU}6O><6Pyxwm09SVIDu`BV3y zMOl+JzvzxAOXS1oO0mQ=Fb>yLXWN8Q*PypkBIcm!u}2PVSNi>h!a44|TCG}@cWVgg*6{Y*J=e+w{nv26LtzW~%1dQ6s(QXusgB!+sg4SlzNNoD8uD3b6`R%u)!#wYA{gUzPvYHkwGVB-YkLBfM#Ewv zDC~l-s$^#*hI zGMN;w+5^)ooEFKfD+{G6iYYZzSvlQyU_5}%0-XTcnf$P$WRo)}{F_k1P8?z$v!v=$ zr#yC{!Hi9bHFAG!Q?{|J(3lgT==+yV`uo!{7Nl+H_%*TmQUp?RXO23v%*cRI z6)pEKqHY8l-EtGx#;}oOQbwGa&Yd81K1J!`jx#F(H6-ZCH8EubZkjyz)yighE5E%g zn||Ut?N9p=g1C>|LAPRQiX$OG@#Fs*slWrVB~X=vy6OyhESD=7Sp{)FlF|g_FCP`h zi)s_D&M{%!%rs>V5v{O5 
zq^xn9NZde$>Scaj{I_Yaq!^%Q;5girc7MVbCr$nFJI0QVLKwnb*{^EbVf?8flDPO# zKU8niHe~y;0&rc>2Lfm>_Dba})ul#lEw0@*6DleMAA$A}nV4`|O8g=qQ2!kYtg$dj zUB%Ti+ZU7{7fI?Q3j2O+4PQO|I@D{gNK&!1Uy)8gtO>)V+s(fkz`Sy`o57Y-`XE(J zRY!F(-d&L_LG(;|{krd_mQ+qWSOk73pZ` z9hEqA`$BV;g8D#RZb8YaN?ZxR?2XA>e3=Z4gAZpj^qz7fJ|^(HkX zC}^HHckENu&hyVbIJBZx-*|umq{R1OR5fT}Wlljrf`0%|tAIX)J|y3^X#sh)`2V}B z4Q8H5kD8jR7uT3VZQD|>9|1PWu=*3AL`w)gN^{vgzK9qyE*x5{9pF(+t4v_?m7DyH zJOZG6Ksico=F4XzQ;M3d0Jjo?{paeC|NZK%i4qdi$d6L4o={tul?(-Pj9L`HD(A>c z^4el?nXaG%VUH+42?b$n5p0!dkJhaVXs~ZXwM}%2>1Xw3H{ZBw4!cT%_cvdqgW!y> zp8~8clQOpcNr$-0I+&qlgLSnINYbnWg7vvl<9Lc!THhV64#wI0D&)$h_W07S7jcxD zt^L5L^Vnj}`f^PF+HEreZ1v;(t^Gs#H!J9z8d7BVjpgOMwf0iAgeoK6BRU@*oyk5- z>D=HC$(tj*ZD4F}ta>K|&t>_(JU&r+0oiku7?E!jIrP!5wB}e>rVIC1%arYYG_KKV zr8nzn7Un1+s4YY0`EtHL+}dAS!6L5JwTU zk)xE3$$s9O*L&uOe+xOqdR3}#3K#Xq7)#(g9A-|}Q;gI?zVWHHa+){_jlLCIGCxaC z$QiyHVGCK*^`#W*d_7!$AYDaHw@Z1vu~MTZwDfs{vD5c#L#}4W-DE7RdUM1Mlvce< z$-j;p39COGrXoS>-3siLSphp$12tt9K`7ZqQwCS~3Z3isl#?f#Yw3$=$7PptS;`#O zSbdJh1#NGk_ns&F<6_`aL`?2loMd%|x$c|Z6i6(u57fe*W$8%~4Syq;xkqgX1)Hp( z_d7_DeVL;i%8V@UyrSjrWR^1PuO>gfvg=D17|oN>w4J#SwI6yL#hpo5`yANoLH4DF zndxSuO;+dUg}#eD@5cAJ1R4Fmzx+aPKSj0+Br?5A1zZ(f__=$)W}cE_f>E80E)eCd zc8VxpHOA?81?iHFmg>x}0rRy@2HsuIEyoP25 z|6{*vFJs=U!V6M2zdM3Ra?apWgR2LfDl{^56^zY5ybM|rS@`o2VJTfZV1UBP$N{eP zCnF?l-sTsI5@mH}8AvSl+vy!V8%=;;EgRy_CF4vQdfyYj*?WIc|Jy9IZB1E}FhlzC zc=9P^_Jh?B9Oo4fPAuV;2t(-@wa?UR4suW8;R{-Cn-_z3P89EBgP{}R<~+CL(D6T8 z_stG|vjCTV!pt`_-m0xxHjG}3f%#^O^V`D3yzj}ymC@B9MuGx;zg*8zUwn>?J&>eh)9E7x-%hCnvA#u) z$m|8k%i?gIH$<}lh(m_ZWS4u5)h-=CDn7>v&O1o?i@)O01bMsN8IzX1spl?K?3H%v zWP!#X`h0+uuT|8PlH$-_>3m}aMBn@S#3Oj#$L|eOAa2;N-GAHNZWK4^?>gDuteW{$ ztS~(`Z@?h3Xq{*(OLb&=9r^D9vPzLX1>{Df(}iUkcoOE;oL}d$gnGDM$HQZmvnSMw zgMM={RjDb!l_ha+VNb#eGJUQ7_HY%)r+izd>rK$kL<^2-WYULl_)%0pG9MQ^Bxwev@tn1 z4Jiu(^BFW_Y_YPz+A}$Nt??Wp_6jqqipahH5c=ZP2)cD`i-F$K;B?~XUbUaF&!!NW zq(Jx2cZ-}5aIx~RG<-IxqLpM0V8 zg4O6Too(Y80Hz^B_wh4nbkBc#!8V#}Km<92DjvD$zX-@#8SI~GcJ%c)x*R{4UXNL~ zRQ+ke{W^-Pc7CtxTsoHkSj7pCAbX+0`%3R)vsoLyPuNjY 
z&H+FSK%=_h*hn4ix=R3+V9m#u`MP9=;P_DRLWmpU^t8-q<=7k8>y5C8Wb0L`J39YHl&Qdx&HO`m1CZAu9+B=7TIRXxF zZd`!j6yek?9484q2JJKf=0xmFoK}JG8#@}syn4Gof*xx2944R1r;IyNJ?ue$e!};o zKq=g1V|b-F712zZb$-qawIA|$@brv6>Om`G4`8q@UCt;mXP^&;Gup|t=8TnD?Siv; zfzZorjV(#QlDWf?UBkQj{*hDI?bv-bgG1m)zKRz{;GND&Hn1X`k1z;!yPI|D_@NU; z?oTPrm>OBmzN@7k*;VOji8Jojyq?0PK`3}>s~SA=A3)bK5yr6VS+ zTZd~^%LH@1OV*XB#r@p$#R?a6quoj;BDD9pb?rL^p8U==I}3c4%+s7oJe-ceG-RB> z8fbxcyv^-|4|UC*&0<-+`=-uyJC)-4#pLMJ|_dF@UITk{gPb<~3zE4@< z#g|H&IIg@@3%#No^;25NuzC@sciON7$KU*6!*1xa_R{57g`A7>tI}98%q0Mbq%O2v zUTgj=c}QEMElW>*oE| zDT5N!ICzrP@AWZI$&jfAg=H1$9%-BWTyzum*!)5QC4bU+S*v*bI01(9<6gcsut%S@ zZBV%dQ#~^6?_o6FUEfE2&hb53iD<2kAufqM%Clb_|J`i~qZE{i?h=LQ)PPwjxWTVJMvqUDa^9eYxVf=Q^@FF+X`{30PZ+~`E62M>FN*_fsfOfh z-C|;(zSSy|KwnhrYASx~<)#2zcw+9jeYWl82sXuIMn2OQxj`0ZaSM4SyU#cIjjKqm z_sqYao4{WQP?bo1byh>S4VF{|WS+^14}Qo;vW?@-5poQ(NP}tx-Cn#jFBNaI`Ht~A zp^eP^(rCJ`$Fev@e%|!LIgEl9u@}CM*-6mpD$0^ZL9rkC&6quRkU%aGMg>!T@I;%$ zod=I8?izuTl*{FSu0Zs5Y2(E8-QEqWF?`nj;A(iAv^oDFG^1|rXBf6DcOW1>jz=8# zQ{|2dR=SM*bDkl7tmD}&F?T!d&FT(hiZgI7_>pZ4Wy)+htEKf>{Cc|;4eK-F!}Mer zh>O`?=tLu;;3Lc5ai258Z!2y1`r@;6Vp66UZAjL~2s6z4mL*r^cN=q#3v>YI^AE5b zxR}UyEfzhIk8+zMIch6jSGwzXc_-j&zSN)86XRhJ_SdVB zbUWJTb!SuvOMiUnTRtV+Uh#_%uTdAsXLUh%>y<29cy>V|^bHld85%mLwwvQx<@O++ zgBRQLda;V7MDnAgfF3IvWrAk3ETEErU4IM7l&TIQuHu8CKbs!3EFfO~#HB6RJvm9% zO4lTvv}-YL$s<`y_zSed7IT~yiaces=XelDj?L@<)tf z6(U~mn|zu4R80crf+Ur5Mn`>7N6;u|G!ISPo~kW7*8}_wee~5~Myhm%m^>OEq)9bra+v4BjW582s$o$}r{q z5w?U-OB%32{~38=#N==rByJqz|86XZRcmK`FUPY+eomPE>-+CJA@(@5h6*aLT_dlq zWG?Vl)(28G%C*E-oQ@ub6$Ntxm6RugJ@HbqIi%~$-_&_RrMvBKants06igO|8xa#3 z9P6CdZ5o;bJrU2&MnWQ+1?_aA;I7|NY_iWtvprvx-=@__yre zHceGtVIA$;iTd}X3KVG6IPT154r@fxCgqq}!hYeEp5M$?s$wfi3HXm%S3xDn2cF57 z;`Lt~QAKZr_J6Bu`rZt=uvMCMR+3kynL#s~*9mBm$zSfs)^pnW!J?d2PCl%~6Q%dq z5^R;OyRkK98Nbs0Jq4UkN85U+KZTQ~DrVSo8KYNjw?e;jx5oDyyc0+}EEa2jPt4Mf zi4!L+VE=w5(6sJ77^O4_KDI>;zRM5}R1@?K__3D#c-V%=CU@$>?N0G{>j!B+)o`~G zfnKiZ`0w-GWZz!r?cMe59AKricCIg6*b_Eh9lbDU-qIWUnSdb>p!F5Pqq3OX`C2U* 
z%xrPIit(;5vAfF3Q-w<^v20j>sLJ?4{(&R4c01Kc$H#O?LOJWy9s?~yZyzcffkWxd zfJe$Tw<=zmdO1)h0df+aJFYoE8fW+GTU12jVnsp0H!XoIVM9e>Yp#D~z#ZqD{&Ojcf*LjdVk*Z7y>!dSy#pK5iJkJ5oN% zszg^wAMzamjGPBgtqPsSs8Ztu^TLm%XLx)^D`eD=rs~fkedayGqqBOdiT}ZT#k@Z4 zDaj@@K5C#r_*pio%)!MFJ);tX`+4DSC9jfP56&VIdq3ng>E^#$9z{i*=FmL#62y$l zh%lH*zOT0zX;Muz=zw*{hmgzmrzm_EMLrbeP8lnRLrBUA#0JbuKUD)jDi*3C#^6x! z#&QCxV{sFOOyW|0`e?PCOya|~X7v%D#v7NW%fm`1so9JjKIjxFc+Y57WOQReVzxoF zuY_=G45u=`iIA1T;}cY|so60$RTlAiu=FkoX0=c)@%xE=OcWLGB4AGNTRD`)#=p|cpakcb0$KY9 z+Xn-3Hp}mwF1`<@+DDLqJrCD=0!R{9ZvD1*b)7C3{1I=bQ4#3S){2UXS~6we4#nUUOj}a~ z1g~_rAyDsHY^2Ce2!Csarjj)-;tFIY);fzp&^+tII-D(na?@D*#3`&wH2D)hJicF& z*P!VgR%MU)J|M__eKEs6hr=h^>lqca@Akx6nXWQxm*8}!$q&k^_Mg(K&_Uv`43s8O zzs?M~Ai2^#8MBk{2ojG@Z?eY|<$hH*8;*ZyxwpjF?34h%uqCh0Cm3q8VdPVjanS&{ z>(2@dggp<@W9cskxrFjYU{Kwo)_c9~lLUwa)bmHsbJOZ&Bhs|)!+ej(oN~kC40T?^ zW(+yTra5fxZS$cu3xFp9lf8kDzo|c|DMTO4C)m$($}DL1a!Y8?0w#QKdJC8qVY}MZ zt`WzJ{?4>xS1Y`%<>mhE>vKV>)ah2x8EZ9Jkx;uW|y z;HRO~mp0L1)TdW3sHy#S6u;vFvxO7Q0yW*hUYl9c+q+he&bL9*3F=Db82<`Zt;ASiQLzFWWB@8d%^lXbO3>X11a$ zes#;|fnS+{u(`V-T16X^)pL^27jHKwfAvT=tCM#~wDk&Jbe~tZ1{E+}={ERbwP8wG zDJRi}97(H14~QtFR;qy92WTXDRBQVw8BjQCIZ2V4wBJX_EZ9W`U+;g!VgMTo>M1>-p5VeO4ruEG24|2uxzQXCw#IQ;d0bsKxwx1Ra+< zW#i&L7vSr)!XD>FWj1C*$;ODe5(itVX7gYD;LGG^6H;#NFRjF%x1e1UH_(G)d~jaw zgDsh|^xt=g-EQ&%cGyZn*{=6(q!3)Zv>HtIRVuFl9#y)O?B!R^2(h3oX}!gayBguU zao#YLQEp{-W{su^0r)ETxEjjwPvHF8vTJB%wZ+u0Nh02 zw(F(R1oYKdKkrVN-Yx1_3r?+^wJ70*8+PN+5?zfraxf=9#C#0^*u9|qyMqX7ei&W8 z8a++8(~pcbkL$1B9I3#{GqbsVPkhRNX+U+}I%KINQmOi6r`mTlKnhMuPGCei=WmO= za4~X*TP{p3`40Y&^m}7m^WAUmm+|qZi!X;hb^wgR?Y!*S+vw^bBAPAOhU1sd|BwXm z3e(RZk63gS;-zoa03Y;$nfO3;D#P}1x+=h1dLMtv7Dgm1;&|!38m8mS{~0jNLq@u@ z(1;~u(QDkou9qH;S*WI}679R?FZ?UUoBG~VEtdP6eiQUkO*yx#IB@ev|2tQD_qoh| z^qX+!@=<$@3!gWr6r8DeW^)O805z^OvVOz@wZ;7Q8}K?v2THc_?x=72)E&12M)i|7 z(}O50{q}^CgwkY!zX1OO!RgZDRn*X^+@z$8Y00rLfLyl?`0@yOYYfsB*XjDl7OzNq zKSv*8XQm0e4(y|I=j z*ozpJmOPqaLm^tSTX26)OIuJ+h5P%J0?`|=CTF7a_rsoaXEl(=Q}DM 
zXHVH)7e>D>Rw_05Vo+U0!gqlOU#uLYix-%@(W$h;`aS7GswZ(lz3+ES;5?U%#ZNRF zX;a!vnBIDefw!UZq*3ea4+&oNj;RO{zTAV}f77)BdfezIoGm$ZU3t~pUFe~!gi#z( zzLV6X^j)oyg+&}eF+Walm`l3*;Vmp3Kjo^r&< z@w$UcM6cqK45D8Lt>Tjhk(pJ9U@lTpJ#2-5mrQHfNxmdu7sr2q;b1k&r4Bav31d@u zBT@$va7#8rJfv`)S`R_B%UfqtXajdKqNlBiw^w3*f!62LSs0I--BHPTAtTPPf;H9TPC5y6Y<~ozzNZdq0Ta(N2Cf zg$R;M^~Bxl?lIaGK!8VIpaN}%a0R+PQZp@5(@t}HYU_a9j##A=ffk8M80JC>yOamv z!RqCcKM|cPkV$Rj-b|Z7YkGK{k=cVoeXP10fxX%~&C*-5ov(K{*>~1w2k9cm&uv+b z-?;9IN;9Ob8caL|+6VywpKW+3dO&!M=h#T0iwop+E>~9Z1u1K@c-I@btA$gvx|>*0 z#Ly3n&Dq}+V=+nhymIZ5ThemiPd>H@_|barINDv ziGSnUEmEZEhPR>GjWF|1SqO{G47BF$ilnxXi2|&bYwz)}T_?n2JPVa*6gqi|LM1cSSnyhXl{)Y&WInA*_#}sJr=lLc^=qe@{E*X7^T) zCn?yk?MKevcVWPpy6m7_d81b5Bgfu&@%#}@&jmmKX*I?4p!SPC;UW3PKEuAU;L-vh zFXs94p1(ov5C1q-OOOQqD+gp%eiYQD>5!PS^gRJA9w3Yxi>VL?xvkzg{?RJN|7ewd zYw7=H$E%b~UjqJEkgf`{Ux#D}*$CbY3L3zN0mN~eA@I7=gwmRT`^HNgvt1mg@B_>V zRFQ_1Qi0fPi-=Hz%@*bJv!9Ah;%h2eel2pu;7x$a&xFJmf!zfa4yIElxGX9t8HMYlJ7wd#?0~yk9hIwimS3lOn zrZl2es)G-JWYnzF}+(E z=fAjZqnBkAkjdnsi}(LN@%}rpMIOZw^k`MKzsWgu$VVQfOnMQ?WP=xoNOi#pFYkZG5DDEHDaha zY_Vaxqz52~%X~)lA%+N=BHuH+3{e+M-j>5}B=THYm;QPawthy`^uxIqVETcmx_%S1biWC@8_hagbf=k^5&#ZFQ~JU5VJNoY0oP1;mk-u$sxluO63-YPZKYo zZNH*%;ka$DOfMijBm?v;Y_zsghpzHc2M-H6$PVeBnGNROJ2xH_h0fI_UEC~n#F^d> z;A~DfJpA!lDLg*di(2Z)r`U@cK&ybum$b2b{k_wd8Ca--ROg@whbq!P-$0PogOi33 zNP<~kNFynhO0F2CY4%SZ5TTP*`D`Tn;(E9bi8Y@YS}1oP-jlt`W>ulh=4y9RrWHj8 ztVXHdKds7E+P=Og26Na6WiMVhj$9mVM1$<-Yto4nur`S4*x-t|qtD9p*r%dlfX|^< zoH5v?yFj^^6wJ|SP>ftu$%iBW1j&N9V`OQ0Q@$U1^hz<)xW4a2yatDiLqu*2@Z>?SX134vTwfSoX zjKNLCqZlFco8J^V4eOeKiksyUu>lxHL!TP1$d8ZWfD$N6v-Op%En(6tQ3gHAMjSvf z$-UrL_qJ8pcYW{Z4g#?>E!265iq&ZgWu}=>G9>|hY+)WGYWkn#RAf)iy zpF{K<+k*m}UbEk0V@GRxv0VSovx>ZBs$sfo1b=sO+QXcbGBWPb@FZsF^YI?sR0*}r zR?@XcT7q|k55vM-#kF9>S&uH>lknOm2d(#gAI<}!9GrqePlQS|yRi#j($$#Vq*2I^ zsP6zIY6^nGk%jN=-vl1@)RBTNFGfOEe3N$4nXxGVk1fi# zZ^#_DK=udkKLPQ<2UH)tSLcPf$iZW*(I9~&0!AUc3%u^do#{$p_fxLDzK?XNk1eIL zCtoiYi)apru=Kxb8wjTN zEd4EzxpRvVMK!<&7{iAI?5~@EBU|Zo<864gzeNij|H3K=)F1)^ABmllpKYr)x8L#i 
z(#^{tNTj>ZM*7l_AlIHeRmOV;xgT0GZb{X=LC1PH25G^|x!;@n9u%!=Yj zhRBUW?oBJ8?ZsXT&qBZ)=0g4sae#ZW!uWD<(@1t(eHfx>6-GxKN+TIh4;l5 z`QF(6sK8$KSG|s@h?_O*Lc<{F#tq)18p^arMd-4>uCp8*GiMZBwwSLV8dS>W7glCC zXt!)EGetmEAK^euiHcrP-1G?KBs-V$Yl8aJC!BU_S|GCj zMTfm2Lh0N+vOhJ^NC=@q&X*Lv4yO}XtlTGBNvpCV`}(-AzF@MXE7SKxVtJ_FCEQB1 zy4B42-oBz;=+4dhFtyCI1--)Koum;95))p3+jP>N=OdrcU8^~eqB&lUNmtawsNrqj zyX1dBhc9zy-^c~tMm)b6Yj&*KZixz9FoKJVJ=90>mCSOOU zsrNcl4mi!&)qV;YTvJLO{idn5;;Zy^Sqnd=bOmqKiy(iF6gOSHj7Iktw6@G&{L;(r z^QsddDHV_C3><`(eN0PQdC>0==-CtFz!DnIg3iM}c5Q#?G5 z$WyKe{_vEpd*Xi8xDT3J5wAcUdHT9}Y+9wU$N# zS>_C+l}3~y?TP*2akLJP*+x*RGC`GBNixqKoKhv z9y4_S5r4Y@f&a$)0f_8ARt^UcA}GcfjBNdb1^f0FpgsNr&>a64v%iN4+42YF%in?K zMh6tk4dmemnsrtSM8xf^r%W?7iuC*%O4=y{P@gFGax;%ajjUUVruDyRBMC}OXaTw) zT99z+?Dw}Ve}Z2nH%ZH60>kz;e+nSa6mcMwg%4d)0h9-N8}$mC_D_R4Su_-!G-z1` z9PlXswv$}Id}FiGw!V}Z6n8CZO; z4bz}}jmVpci-GmZ9Y6f%Ygtj;_oJUkds;D%P0)Ts4u={DmTVwKP8hHf5dn?vaN6W1 zaQqvS7&+vBkrKa$jk5fWH6`Xi-^>&tTqF$`1MJq@D=*L_s7|gN9B*nJBZIv7_5qVr zUZE87S6QYP*nEZ4k{|<=XpA;rrt>Ttq$!?NQe+wb*zoV=vXV+eZwZjNHvn6f4+ZE; zen9*L{AW&3JfZ*j%OhYao>dpYS&&m<54_G>k$>*_AAbQ}0~7gRU|s@%c_F9PmHCs7 z{r~h$ha;P`$ms_VIw~*+8pdx3G>~_%=8Qi1W3e(5e+3@KJYGQy#E%AG_sZni{P}uW zCS0HwpKM_K6ItZl@5ALM|GdIG$xo#3VG&yW*7|>r7g0>fpRWhBArq4n8HW4>F;d-2 zGW_TK#Xkp9AzyG80(r;46CnS;egR*U1OG>V<_(oZS&|_QTd*XI6}jU7`NXRNv-N|! 
z2=SjJW8^lFL?;6#-FU?s`kxDtzYuRB*KV$ZHXFHi?*inQI0NWVcCZ{ZbpCD<=3rox zyqVI-`*Q?WNsSMD7>PdOql}2ve{3AO!;-|TZSX_mBWRNZeMQj+9e@H1P^xMPB>h4A zUc$Ac0n;FRm#Z0BP$nn?2UaYp=84kQA**VL2CE2fH6U6v0@b7ZL_J^%NJiTYK)Qa3 z1D`zvc0t6e@f73$+Wb*(KoA_5YLMm~=0Wn^xF;kyC_hx`?I$dC0#K+N2T0E#D2(is z5e>Spq%B|h5@)CwJHRxsV}~0g0rDVkgMb#$JXiy2P{r1sf>QO_gj{hFf0W#bV=dN& zM{*qZZs37-QC2b3g+>TBA%AwcEU?Stp)X_p$fj#HNh!@)7HiV1IZ&GzXzwlexEF<_ z?<9J}&+%zxHt2)|6(y~ra7HV0$%E&c3j1E-Oc}WS{ zTsq8GCss!n1JV^iouDGLMINXjs$_dxgoGSE#)}p7;BUyB(s0lRcN_w*NS)Uxb?qzDF|2MG*SA90*CgF z*ESQ7L&O)zg2hC#l1b^I*J&oq=BEACos68yOX#(@G0Q#i&+He1xm zJ_Hz&{71APp5(L@QhY|B{bJ5}4)M|gPz0Y41`1HJ2dX1SY4B)N0^yLt?fV$U@k;ux z=xv|h=)+wlKs+KV74=o1dCY?lB}cwjVJpj?d!ap0I`?;bx0J>XHoMm2)gp zdl+RzZG${&lyVE5hUX7ip>Py-d>Cgy1`iSGb}svBfR11C8{$|)I>5S&fh7ZcR5kIgvrUl3@IT& zVc~q3tZ)j2PA<#iLuJ%5@-}qsdxIztJQyYjw?F|j65Jl($x5?WzF(hYRqkt{n#TmI zGnF`Z=ogjdReK3K9(wD@ZIoyE+YJ;=VQLahH2k^uxh;g8mmEAG4lk#0J^>1EE6qa? zIhz%cjWETP=PZM6(^L~mYUC>}t2q%t>1s$F^^5_~aI1*1=|6&t1O1u=tPE=NAk3%9 zZ%g@&JLOLT>LyH9<%db61KT-ngP$C%j55gN?q!)`5}ifHvykgij!q!Pioa>OgV#)P zJmiQ&)68?csjjRZ4cUJ#-xh)~kN}XeJlrd-9FM7u%rNeVu=wB~8!qUQN2#xe-8dqt z6Fbc~hS(t~9s&H|h?wx)h}_`7YyCs{eca?mjS=+#7I;r0%tV(*dvVA0vts^6I1Xw}-Q~bjnvKpk=;Yvec&DExb_E{+ zIJr+_B;&LL82OR|xf{6x`8#s{)%Lf8$39k!X$K9-KlK2?7>)x}9K+WiL~0eu5aPwF z6?^R>95Nh7oaC3AzV*Y^`cbI&-*=en95?Y|(%btD8DyTlEs%qA~`@ugTZYnI+ z>(^J_%YJ|Q658{&(G`Z5O_h5T|7*jcB6p}dM0;uv&X7xtKRq<~GZOI$R7lb z=;&~+zmky|x{{_fx*O_Zl|dfhSp3G?AMVor_)u?NvP5|7E7z|>YQD)2Q;9VxdhwuW zZ~D478>reYRjGIbe?HFSYE_kus<~{ci4Ki$l?j;F=f-)eU3U=l?9vtbvl*Vn6e|ngH%+&<_@TA+V3T}ICbQk_jfUQj7uW-UQ+OUV~nUb$zS3Ncg-{upC1^h6J;2=2r>|7 z2aOeQZcO=`_AoOb z4mcS`1hs|eY!@N;lE;U%UWaZV@rKzt+Pg%Bq3^8(S1k)M@g({&g%StxZSD+p5F&Q! 
z&?>`Qfb-Cm0VbgN>l(?PN>XIn8qNbhx&R1dO1oWeH1vwX6p?K( z_*93LlU3$s*i~tYdX^P~l49arU-)(@+^i+EOX@Pc4*Y!mz$H^|{z8gfW-2e}2e<@y z^y!R{_ty{MFslq^#*D7+)15m$TJGSd*{8<(bD;Z98U2-rZLGQ(&=$d4|Lbz|E}jkJn$R#HYuQmpE#k3SOD)puAY!fTclTB#xSZ1~=^SJ{USgGFxZ3&zr8qOiVt-osP8W zFba9#RRFzka;5sf#S)C_?AT)BT>unz@-4sdlXCMaaDVIf`?1hd+9g_7!=YDyiW8{X z{0syh{?;)(Rg`$|YpAVXB(&8ntCMz3S=YAswT4s**4QWWfg8wbuzgl86`HBA?oFA8Kd-ssLjl@%VNfNg{(~Y3tUY<-T@PHih3_x}3No!pw$^sm9XFW+-Qt3U zz9C;281djKa{6%a=t0rNG*96Yq2_1s=pplH3)5FWjko2?D>e|qSH9G#n2{1;<#pWz zyg%|hv}gK}h@RViXrmL-Pl_JZ%Nxf3nc=<#?L*1)C4#Wf4<>KFCefo*g`x#p?ZUY9 z{10z)4nw#Y%v1d*7#8ODWWQU7VMA$5o9PU2rPuFGMXna4ZwqAN{_z`%+7IW zR}u#?det7nU&T_Q*U)*O!m>ZCMd%)VSM1=xQ6Fa4m#G||{-OZ;2jq&$R2ptZ8CRyu!WMm;K{zbo1a zo~rx+Ay#^Uc(~wJ0%qIp=x`k>HgGDLd+Do}CpF*XsiYBaH-xDGPZZ0xZ)sI>hgw~` zQ1zM!&7X2Hzp;H7ftBwJ?Uvw5tfZF;eUlj#?fuyG)k3tU_7E>Fui5v-q#`kTCV&Cu z_e)QK;p;Q5yv@#2)gQ3NG=D<86jVdQrQC;gX*sEO!n8S$zYNv2*lzIa^A_6IHsi4f z@s+p24HH^%<=(@EzaDqCm$&B7MI&{8`7KkkRro!2r+B^V|5v3AzXkoDlNn=WUTn<_ zM18M4vo~{nhIQ}3(#^0H8N$;o%Vc-VJ`sE|)ymuH>*bgNUZbWKlu3|RhKGPnvb#@G z&?Mhsh8(2x%JKJ$FUoD5Ui38EIayBceL8r5PBm#S-{4S*-sI-J2hXU3Hr_AN*hAke z-cUVI+yB?nmD(F3J3rS1uB@ayuMZ3%e=alG9_k(~>o?i%q2OdAw-r(`^6=|BJD=i1=YQpkT^v3OrK`M*p5yMm;NW$eo6fjmI)r~0ZjQzH8*~$H6n7= zeQ(rj@oTPNy$Z^{JX)x*ZrkIYvP%8-yZQ2!%>hmnLU7bL;cAj|`(UrfJ{$10L8-Jo zb&oy5uwGvMLBC`r)|1K34Iy0gJ$|X86j`&AKVL4jpi{ro=9RnR(#`C(h&I=%5B_O> zESaOs5HSPWbc-%5gDvcZLgKKl>$|-l=_;M|J8+eB6s%wHV3nJ2xh+M1Tx!S;zeO>z z|Fl&3==(vFm#*zavU%OD+$`?GQa#?W{vt4>C0S2|nx&z#T0FM#=Ba)(K|?D~1j5_Q zZ+x$h{7&t8K?Qs@{_>3vjYLh)W&s`PUf3bQcseU_MQhI4C zoxZI!LE8Q4k##u7u^}UQSo4U>!;slJka78_4~;ALlk!P?j#tD!MHj1LqGbJHJ53rY z+o}H=#axJz*3J5vDI_Mvn~?9-pKn^R@_lPpmC=gfOR%(ihSL~pN@tr&?8u(-)aL7E zvj;O?I6VrC4!ByXZa%VA^g;PBWK+T{8ih|4jo;nLuBXi(jv$@5_^l4f+ls79eV-fg zT-n%IY%%^#m*bCUPo}PNlyjlioA&b$Qt(OhgXNGE+j2y*%@|%{-K(_5pmm+c{;PcC z6M5a3$o4%vS~D@8*||@vO9-_FRheb5tum?Gbl{xjt|7SgV%5{e+Xzkjj59S2v+d0V z%Z$0Ccc%$TpOMUrgw}5OM?9#8R{dqEEnVzw7sjvJ>i*f$idRktON>P>{53 
zn0mK&VXixdSXP2{KuM|YY#~GkpGdAR?9G2dc1MVBF;aV+_~ixS)H+61T}OC(Jv^5o zFBLc&PNsO~5Eycu1rwtUocmV*`|$5}$Ci<7HoOdQhiPJJ@^oV*_q+`dBDQrn6@qL6 zcy$#i5%!h`?^Z3|BG83v$Wjmw!Wl{vvVkq+U{+zG%kj3tpzJw3Q397~8gY@c!%g9C zS*WNPk7!jJMIgwU6s>k=>g9T=-GVd%1+l2?W zwTF&u0C4LD@=|m!p6*_49~hAu2Cpaaj?e_GvU zgPDPoJ8|kvf_1H4-LCf1oF}uBZ@&$ZX<6IxK`bmMH?_Q8C8)vyB24VF+UUPmanryq z)o;SxXn5&~eV#SRX`9I>(d%?axYjP|5zeCyDv5PX121%SS3H~?83mDItAqyU?vk|r z=t-YTZQoCSy{XE??raRcOs)NiM)#X|$>p*7h3_jKwO??xN!qS&<1aKcP6MQ1qRw(P;$&ClZiivp?( zT1wc0j!j^;ppeFMC)JADb3S>Fo}r(UI$#Iqfkjw z{*d0W88~?^+BsW%+i_wT>>>FlIyqZ{7rr-nFng|*^cK6MWu-)?L}}{qK{iK+)6l@} zN)1wf0o<;|mXVvC>|8jpvjpYbdmxu(4=1aP^9&e270<(b{4Aod)S=WyG8nitgC=QW z>Wc#-5;zZwmjPAEl81xXs;#CeRxva4Q=61MTO~pd@{PVK0yX6<&r<>Q@~X( zI!5lhN7}Bexc+m;(W`QyDMbq_(yjuQZ`EX$(KXBzNZDV=aJJ10nKAqs zWP#86x*~(w4BSU^UV(L5p|Ce!pB`ytjIz6i$G~cWJl!c*#pl3ecr(>Gm=87Jg(^R8y%tN&^8_AjXt{W%8g9A8~|TO`wvFg?}TBBar3 z|0{9;+w`X`(q{!N*UlGraGn{tqZhfd0bH=2mYgw?Wd}h`#>Gm^cipf0_XTqnF5Zd6 zp>mefbgGO|pDLcXX8^z8y|sd>WTK{WTfyeXUIKSqQ79{2<6a-ThxWWYm53GdS!N)RG}y}~EwTXB7P+Wk*Pr>FiZuA$HSbVF|cI&f6CJ*1r0 z)B(d&W1D&-l*s#Dk-s_5;z2Enlx|wy0ulhgRnYdo7=AgXbln(|az${$uJhWb7d`Zw z8+e~<`D3=zFULq(G09x=Z7Y{bo%eGz>#>vGZ9TrqkbVl%$47a98r)@N>w#sruTo=- zRf2{Xd~g@V%W`ixe}buLK+W8k@oFK&Xj@6=^{OVxYjRvce*glMLl{wr?;I6qF1A-B zFY(oMg^PG0?USzA_v4}k^(#Jt6lb`EU21)(ex>#7nW0>~o0wyTzNu=#sgK0)`&c%_ zI@7CaaZb81jek7%J|-vvm&&JNkH_G_mDVN2AFPtCY9|tfE->gEP9#(~o{e~zC%TX` zNb%1gj;?h&Fj;a^|E#dSZl>F#oBgRho$Z_XcK=IKMIf8DufT2sj^l4hVPQ}n>vUG1 z(umZ;SOPcTD6ra1 za@9w-JWVb5L|EbTw5E2XZ`gdkRD9AoPaaUnm$MTdL_OW9g|}M66xBL2FR1|)9GUib zeRJ@W)mo!O3W;55!P#%HuN0=>IKyDvYs@jMgpHu5gCr@HJaT<*-Yn~zIHn$Ln&gvW zV>z*grnog%Qmne6y*EdY;HbtW{F)itjYOD_m4 z{OuR62#TD3PsCEmHpS%gv4iJBPx$E8OQ2$w)0rB%^r3EB@2LD+iuozxUzx(1yPIVJrZZ_h4`B|5Mgwv2`Hb7dt8l@ySZ{>*a$RYLx=MDiY9 zKiBAkwpIQk#Q!T%1AdlvF+m%Vzoq}RL=m(R^Y5j|dGg-IUuz%C4}VsXz}32ORPK@N zEAyzZ{Cf$gO#ZWU;}emm^Nh28WBUKYV3apVjq72h@l4<-0}62uT$t zWR|ZVbobu;X(@kKw{cer&5cg*Bpi$j`YdllCzhB+s0)2&${Zfl8np73{k$Y*db<+% 
zRqX^HJP29gP2cV)LRIKhLWv-MQX>+)Uojh>kZ$sBp|yO{zFwiLQ{}rDuA3nS~kbjcUo%?x2_Um0CM!?%l#(?`psSDp5NxGf-tY_M) zLnDL%ixhH$7$PJOJxamKA^X2V*YuW0f^CU+qz|c(Ij^YCBhMVpMp4Y1a=7T4Zj!Y4 zeblwu+G#^q2sfR(XYXWuMPV}Avy01Bc4;URUKb?%5OChp*PKWtP~>Dm5g((G{AfhB zK0&_}%tiIgLQ-(RMuq*1h)4ONlnNvheP^My*{8H7CLgLY@dy1lO3GICxj|zL_KH8f zb)UH)knY3EJ!y6=tqLf)NXbMvkY34FL($@h#FGkKCFAcg>7mx1VJ;vz@b8^Wdo~N4odWgE2nf<&pb#PveO?|u30Ib z3Z#Kct~Uv$Zz{_mM2Pl8sg#gtt@ZjfyYhu}Ig$_mwXc->Fxr@qf;wJFY%qm1HhJ#5 z@hEy2GO|CE4FbCluOF3E%L;||&~*gctxDH4&AZQ#O4G65e}=*-TAfuwORSsEq5{fX z3&mBvw@&~nvJQiT)x!7%Dt=*B3AZ-ZiH(5zwZXaj$(R=5!IKE?6x+IWf|>qaG^ZPi zn1e}k{%#=zk8TnoHr6V_T^_Q7JW*zKkp?_V9;^i`cHZ@&!!^A#9e_X^>0#Ug4^n?IT>HfJn$=pC_I{+$}dgSjB@cbCpa( zwCvF@(l>i=h3Akz2b{TL`{Yt>9yGH^TCivaS#e!*?b{442&d9fSaHB&Lk9^WM+TmZ z5W1IA+Hj)l?QMAU3GQJ`9UuH5|d~lYs_5hBx zPr-|Cn&b0eOQq^ShO%+mzdgnCKI~*M@c)5<7n-if>HUkM@YaniQ6?vZP((#}vQzi% zf;9U^zpLFTJ}YNz=!S*yZ<6riDC>6}b9j_Tgbti9QS2LHE`mpjGru^VI9{MjFAs;+ z0x4!>zPUgvzk@g>-}EvkZ(y|nyf&diO=1HcUHq0?6DP_cx|77L)>CjL#H>YQY-L%( zwth)}HO_GO++g=ev3G&>o}b=>Y|`d$c}SC-;_2d5|Nil2E-DbCmr+1Gzw+X{aFIsw zA7652{O_Uz@H61l(QCoWh!YNyp8eSd?3 z(KsTU95NO~lv)AGWvTC<7^CdxC^#xSiCE35GH8ZOH*=uBkUt^xzQqst`|0BFjORh0M}13U!;l z@lxK(akmu@{gqHI4AEFUI@9!*{>Dx+f$6E<0;^EO`AEv@<%@Sfc z3h?WccVJwM?(HKYKq=WYQfwn?1g@-O8~1Iy_Gt!-pn9%c{UdnpJ_mHwd)ggO_eq*Y zsy>vAyTHAgc|Q$qzj6lvDU6=JEKjOnGB}tl)O!D;9t2iAhfZJjz{z1J;*vHc_=cKL%H&$oNQZ12qycMV6E~b9+;S9d14(1QJd6p zy2Y&H<&=e7PyaW7QFN01U(Ihsd*)*AcB_Im&(%uoB>U(3G>uV}h&t$7imoxS7n!n%GEI?dfbo~Ivw9O8H`|f*%i%KB@mV&3`xfNt`9NI$C@cttd0opX_Wnz7LWA9O%Q)ML*It-)`L1{d9m{x3}i2!=z{?ZODqx zOTxK#-y(3ciwRgeOggtgkrE6|9d84`f34SzEZI4IYnyimM~v?-3qPzG^|ys)-VDJ= zh)s*g6XEti6uEO!l{Os1_6jQ*q6(g_Lxk;HloSL@20!yKEYvmcN% zt3->vYjXTSusvv@)8PN5Vi>}Zp&J()1lXqju72Ak zmMz5e8y|MjlK|uS$rAZgM3IA&#v*PN_LRnSthZ^7`RRt{3Kma}IP`CfO`Z5*WPvLo z3*$mnHUcN=2^EBiWmM2P^q+?)gxxfy0a>wzk2oO~R?A~KU@CZ4o0`BO**^b8oPi9u z{z~z?&Lm`HN$%5hmq9OTbwU0%n|NN*EX`dY7~e&(+|cs2S273 zl!IZEIO@j?zh~J9`_VJ&NV}TYf`w!DyC|iWs@1|yLm|IlrUQ|S5p*}B!6!iqojF1q 
z0+qV&sO)GZ5w0GZI{{P_52vx!%<2y(O@W$qT<^T0QE(s~;)_5Q&BK?TjKPi_A{9Ta(mV=4TD+(01}wtcuaj<1Quh0CR%o~&!ZVYxt4 zU_eguRe^mn_W7~?n-L3{FD4mue0ygPg|f)lIVdZ?R$fEisJq>h`CvMM~Z84eDIW~9Z% z0s^q<^qvf#^bvcf72qW5PKAFBI5Fxw&m;XX$HdQkVMGf*#ulwHHaySabntZUqmJu1 zZlFKyk(Dcq+$h43<4<=VUp>cY{A(8xyQ>y$M;|pM#ItPU!UNn@n?=`47(@bKJYY*D=YXd z^{JdJC>N6lt)^%K7lkgpfn&)ayU?ZdCqEwEflKq+{JFy7s2LMz7#AL z-*QkUog+eW{--UJwgl$%oi3X()U^(kH#m^`L0!^<}M7iZgrcHRZhKn zbYWp_xQ>KnW&Odj6#PyNTZ|OT$(%Z+8Qt$e1MBOXQXdho}zM1z^C+kuX&?cxXwt(ObEax-gVo1E@&rd<(OUb*}$IpKfIW0b``ydKkWp+jCw-2zI^x;vdu89V-gUT1DP zRm_Ife@1tD)R>iJQ=U$zQty+{l$Oe!L&M633-*Pr@KrNH^&1jj#UtN#361U z`Lf=wxb%jgB4vv>=e&XO0Qj?F%pc0qHkuM;$4P!o=iUf6b^flM61Je z)9WHDAXN;q)h)ibsY48Zh(_amJZKp zIE_t?kx3ujo!BYCf$m*iH>%iXy0D;YpEO7I^4J)mT$~$*!l%-Bl4ub}2>}Tatd3we z+@r?Jbsw+FCj`~R#w9+!9HkA8K-Nt=@&FZFHv3Li)rXtfXJ0Z+deZhilsFLO@));T zwte!?BSX|Cn%%)$t`ey<7TD3A8qe(IP66BQuqv>SoO1!5BmP?G`?+}@uF{UorTiO$&>yTE5eKuue;)*_dcB{7UyV$t4>$4rJ4a|3|2GA#h{ScO_dQ^j|zz z<0@Ud*<}@Q`c{D?N1+AVv`8k}3xKu6eTBxX;!{s=HWi_tN~7Zng`iC%8+qq++<9l6 zUSLhI9aDeTrhuC6k6xh=Wg;8s#2J1rn(jJDamL0b)Y*xmY5K(Hr44{s#6XW|j(=1b z36-H$?W={NA9eU_97Df-Rl_}@{qg~D#(aXZmVZ^M_S1KDC1~HR=ANJ;PF~6M2-@Zb z(xiMVC|CFK?%Nz}+1;Ax^$$;FzGPT&Q|<7)JlSF_i1nrH!*~k6EgyacfFS zJa2Ov-7=Yg{@co65ENbEsN@GS+Hdy9y&EBD>vvwF(6rPb*qF%xgfxfxbDja$CGUhp zOIJ&LggOIowDj%IDLl8RhAD>c3#@pVO+VD0&FTCRB|!z=8$Ta=6Fvaq(Za;*tU8q~ z?uD;w8M*RGe1e;4FMn&E#58+@+}*H^6~2YEpes%KsPby|-zZ*%#@^kb)OmCR@Ai7r zmkqJZ&+WIkOIreJS>_ubp#`F}rrhNH@A#E4@geCg!E;LZv7kkj>aHV|YC*$m99=s6!QTB1Gsy}FZ(@^#W)t`=1cE8m!e{)(J5zS4FJn-?QOnLjzgq9NlS^`hrIu@x@6!%nLfRIX+_Awye+VmMML77` zhKceP1y90lg@&#*FJVuKC1}|c>TI*t82F~0__SYaqF5ig;y0a`rZ!NZ69aS*j<7kE zpa6}<+V@oaO~a^Xv{R7?|uk6*_gcbh*c* zi{Uv(*nDz_@69}^usU1e>;%Y2lYCc^VAjCKr+{lQ3MN`RUkjg=-sMdmhMl-^KqDF= zSj<)LOhU~61$Azgn4=k%qih}J{~yUf0u7@b2*Mx)RCEJK#*N@IpzxyjeqnRthkMowD zR@GcdIPu@WHK`+NoW?ao-fGnoyYRifN^UWkzO4cvqhz$?&Ezpy0=R@rNo>TN)-*;# zAbdUXCO`_gvkRJdc3v@gS%1_V_3*|E%^6zPPXKu0R(N;Ge_F4o=4MEF)W}w*r6nVX 
z_Y4Wq5*u)na};u%-&v;|@62-C6<1xd5$e+O+TI7%aM~jlL}izdVI=cCA{^A1g=-f0 zV$PnrY1&F|P4JlYIQYc{FW>4{Ot zo{aiRazsas8y;9qAD7sf*qmg|bi#14+yv$CYe}m!`TP<*&Xp2pDRHQ5ndAAVfpt4{ zWi!deR&J&?Q-QnM%coB`MtTXaCgwy!^t^zk2Uso!x_a|>&eiJjH(6t4v(yu>a-f@iFDI&3KkU+;KP*JC9hzVPLY_hwCi#Q0UL zcy^O7QgDLs3?aHiRQh#Rq>W)KbRxT#@Gc|Pws^u^HIvJ*2-<3OCgZYZD7XEc$(o4d zO=5~yV=XfvSZ#_&_lwsXan333;y%M(M^gRlCab}(t~md=+=YL#QH&eKOU!km?Xy#` zeN3LKRt35A6w;Gc1QJ)OuQ18m)o0YX90m?~OyOs0^S<<0Dd&J8I6`DyahQB{zkIS^ zu^fN^hnR@Wa~pHq*zQe4#plXn?vi&95yEIN-f$aH`4E6t4a%2$;<9JC>$a3jnQKXH z(yu@t2;lMPNH_Wo`AAT@)?s1T#?_js({ePX)9uhZVGK8x99Bsj9Imk{TyOS7S0oan z?~-4w`h9O#=C?}p0F=J_o85Blm(~=sNq0=V+sGtSx)$l4|*>mN?6#s#Np0UdHK zDK!l}(8^rxTNL4hBRBu|Lg(Mffu$LMb5+6!HCx-&0H;D|Fo`BIou@2Oki_TmX%`U0 zR4;pf_Jjyw1t4oVF?#M{xE5QhnJ!odvVKDZA7>^xH}Vgg1v01A>5cQ_!l^wo&b|~T0%HAaJ1%YlmHKKCNY0FJoEBB^2nyRb%Rrl1#o|Hc@=*R7< z9L&S*>`u;}`0+S(8?|~S?R|^V`?q~0+(tWGjv|apK#@cuD!Zf3r#R^aS&YI~qC>bV zo$UMdnpap&z2eXs;HzJFYL*4z3HuT|^OO_s+?2zQZsd^%1AxUSGAf6iP}SQPpWo`q z4+B*J9H^=eif{qAE5H@B=jB8Oox+`wAR0UBh$*U6{~*H?NMti)_4G+qpZ#AEB_fgs zmYZLx9NFzWeZGb24qb>0*PbJ4$KLT8MX(;q7zqs-aBZf+SsAL zyzB*zQ(ASn3m8%4vp%;~Vzk>TZhY8NT;r3KcYSlXoWv*DPo%U8=(ehy162deL!AQW z3zvp>+nDz9bOOxQfiBqhy)F`*z)@DW?h3KUsx?Dc>s5v##W29djv*?>XineHX1E}G z#GR4>C@f7QYJ?vkMY>qe`zH1sSU?u?>J;_K6c}Ns3rnj#-Ocw}pFR>LLF#ne*Pdm! 
zvPzCp11{T73Y;&rHPC;8!d*zUA@gCA2<;djcimS&5$3#ir(C{_*6;072wK^GX}^*5 zSp92JB7ibZ9wR1LnS)0S$evAMzZi)C?X(HAdk?A<2qwA%V^o5c^-MH126N1v%vHn> z<^~?ONr^m()g#?#GG1ZNg*nR$|F$bn7HaLLp5d3QaowQtjcKxrehI?`OwWzB#90aT zXK_F~t4rh1#=LZRj^fVmN&tXbIXH*L%;Slj&QE1%$1?f?`3rdAx{msa0QN|uSun*Q zv3;)mV#+?uWpwG7rf8=bQ#@6owNG;|jXv>F9Xd2gw=vZIN76xI`lIFCAoX&fbrR&c z=^KQrp5)i3a%ft|(T){eYPof5+^QLsS34=&<#{aMWeHpaB`PW~WBj|ygPR_2kN=UV zLb;MPfH(a7jiyEDNI@KF8x&&76}d0<8nWE`AdGR{JzWN49)X!XXh9Yr*axUhBI zK8XnNlccjT(bcpX5F2GT#~;ci+X~5^OHKd@#a)_E^=b@YKpznx)#o=S)ck=1_VTG> zE9whwH~@yQ>Qmcg$9*g5aB)r1h3?X&dj!V-#nr<;49skGmj_tDBUeaRT zi_M)+b3MIuiB%hH9+gU@6X(~B2m&ZU(fwBMUg8=u9yV>wRe$am33+48b{u7K`>R@9 z<4!(-mFq{i=F5?&(=pf$>|g<>0~>&@sUL_&QEm>M#hwd1C;zgW@Mx*w6Sudh0b~Y0 zJ3Q`SR@$Z9>fW+OJxuMAJ%N+Cn)ES(K~SZ1ay(6zu}Ti$d9465R!t)gT}15j)BU#x z;iCrqrA@fyY#k2Fd)wXxSd*1^vb$5hD~D%3fNn7c7Y_(yuYIw5m3&H@xGZc3G>9D$ z7D?zjWeQP_M|E}F^peX*jWmZHZO6r`hE9@LvCcBErjh&ILy+uT;pRNJPRUYX+lLu>wE@A98S2wENO?&glO z9JOq-I|YHd3UyYJ#)-S}OF->(J!8%l=%U)*%XUsiQpHw_kJ5tn=dU{J8dYT#6jw1& zfQRYwNvRu9a`0t-pw;pB;Nh%Fd3siLN@xi_7LT$R9K-xz(Q7@^OtJhR;7U~Pcs;xH zdVQxL%1gCT&yw59Qq~wuAh$3oJWtk@>Yf_KgloQZy}H(>!(gc`*~6x8x+6Di*Q)k% zfUry?L>@r-_D(m`yNG69xC-0ur3`U2%$QJ57~VaBE>o5ffn;{SD%=9Zs5wTiIwc}j z-F$93)-Y7aR>-=<+;cxqma&ELy2uWDSj$M?gfHbdjiq^I>g{{WeTe5X1II>f*kco1 z{1~mk?y<$!m<%Tq`>NW)Ms43kAJ;ba#^(U_tRGmCHW^E6Lxx>}?`=OOcDbIY9pkILm+S8Q|AX#UJ6GUJEM zs|dw`Hh~4s2Q~fx0u3q?7L3s|y6t*TDUrYZnrQXp_|5H!pZsB9m-kaIAfJ-8hn8h% zUc|_c^uO)6JFRG9T9L1>!X)sIgor)}x`h_)Fn+Gvi%vWpP}CY8e6jnuKb!9u{pzf0 zmz6hRBkC;>BzwvxNnCi~7otwP6R{|lC<6?S zq>zL+{dgt=g1jM6779JfYLq6M;(gZ2#bWCQS553=El^18;^P#8WfPv{G_4v+XYCSfTeQaA?j!A6j1e5)e-bmOJ}ec70_WFHH~@Ea_jMP z?Gd_?sku%7uVDiHx4TGxa zUK4xe?Qq(a-)4LpK1?n!6z2(p_yy3w-|7pDE6@d?t78M28ouANq6%*E+_Y=?>vd<1 z#K68e|DSZi|Ei&~wFjFq4(R^EGI}Y`_sb{7mGUNDzo^R8z&{o=9VkvIKPvV!e+DLp zvRjt4Drm+wpFMXP)vV@O4#b?q9K(AN=~=aP>3pcdr=%7CS~NK1*R8aw<~$UeNw$` z5_x5h2Kstr-o#%!&~^+wj(%iWo%!Ch9Nj9l7Gk@{FOPYf`l8#0b@>q6O3QpGA=A>C zJ7qhXGA2RDAm-P)BhQgYrT~qN`!?AFAH^4<>z~i-nANfkIONf>P}f~)IDhh0^;gQm 
z0hmZ{1ML_g-NZ436jDfu4PaZhSDg;$qV#o#Lv6Ef2H%_Wxke^fI@h~shD(2eoa;SG zH31Len8#MrhP3`w3jRH*1(G@1{?CPf&uO7ve#o~-O!!kRQ?~Hl>B|a8GYKCJZ{`Q` zO0y>vXFa5Jy&acCZF6Xi#`6WBPR?INU>-JCes6Z_(;BM4=;6%(vs#M3$yX<1SfjfUSDW-$ACo_ z80jvNYSyR!K$8>~Ncc0+VP=J*Gy~-O)NIa+m@pL>x9HCGUJ0vduc(yx6(A4y1)1s1 z>K9(E3qHTTI;C9B8|xM!PT4cM=|kN%@RRvIi<0sHBD|FxWVejDVn2WPE=E8<@*Ef- zzJe@&4yIfbp3Css^9}0F9SS4@MHPL%r0%->7at$nd{QlPWRL9Eyf%YYW!pIInw<2< zplfL%GD{SiKnXbX9^1dj`e$n#5-;&zD3PJjACw!vWl=n-Vfn-W{afO{IhpDkaF<^8 zkn)<4QXTjP7%e}q}3w^J1~?O z2Rsd+xO568!nEp|Z9E10)!{DNZOljtV88=EUV#t7=+R^OF%xhxzC91B&u$Rj!nfL< z`>xd|t`bISh^mb0KgYy;p%YHIbgN~$<+zo?CT*x0By%pIRj>Ny^bjfJjwFeo_|nvCIl(i0ZgK?)S% z0PX3HQ45A~j*fpnQoxarj4A3d3UlHvvP^7CqZClAHDicoUeG?A0_C;-%-u{hu`mJ#{~=n5`a0J=t3UnsJ*R}3$%=|DQ1uH zTPhOxKCeN6Ar}fEn@}GwCWXMZ5q#q?lk#6jFKU*BaSzq{0-Ygb;O ziuqnePt&UKI%#Z@Cbs5tHH=q=$_W5ah+j0H{e3TaU?i%M4<=ouT3+7&#s`})F7%th zVSVoU#jLTUpw#Qjoct<6xF#<`l!goPSSL9Kk&djJt4hEnV#>Y#bHV@*HBNG`lao+` z>mFWKwhZG!?ow;&4@QlBDb;=jWhVq+Ax&JOhaF4fWhydgV0y1B=_Fq4Aa880ib2^C zGco~q7W72KZ~i9$>7qG9Nf2@TX#sxN^QSf?`}Yd^J`nLa7`?gZ%j(fHnYIOG$FRyd^h^O$_`)D@f0XkA5Qqtn1RV`?mzMIqXux>LyMdsI4AR)i! zKnh;&ul=Z10j7MJ@*~Woia6#@%8f3S2hxiB9?!juTy#&3Y+loIby+=6k^%6tmcEKb z=g0wI9*_e$o?D6KTL=DRLlE<+3;--mC5X-$evayfpxlxBxCPPG=)(GSPu<3m>|9QV zKbNPSK}SM;Evag8adni5UWcqKc5+Q$(S(Yc(km)A;VhV6QBlkZ5n8nNnX|Q$_)TRd70{E;DepF%AB>FXRi{E*KliOsMRx%VrBJV!+KYB63#pF?NuI*!{US_e z)y;BJd*~;CS3~OOX55a}h9}Jfe-*>|>J0t%pd>&uo#+Zfj?cp;eQ1Weg3jO0YgGka z_$M}y1a6FeOCeBtaF}Y?A}r-W>)E$3r#uyerHZ#$H?W8r+;HTshE*n$-u62>@xV$8 zdh;(hE}3YA@HIErv9ybSJfVLe2!N zGbrSe?gv2Q0K3ZgG0z1KE=TaYx0fg0?<8KqoUWfqeVR&BGQ<3yM8gQ1d2oe;+6(CA z^X2CPW1&eQz^;2wd}6Jpd$x}?kHUla+ync_9x7bqa&vOtfKK`h*ZKpDq7zpL^4FFQ zsdsf|aP@2DpGM(Qppbca#Ww+~aBwq&4jIfu$jR? 
zeM+Pty2B>=$B=47wIt`Du@qYlH8Yc^_&$Lq{T=(*RMh(if@*df(&$ACB z)+gyMr9x#&d(`Eo3V~}ur(uXZK8rLkg>( znU^`Z5<5mqyg_y}E4qYJ%%1McMgWIc5i1nLbQzc=2mW@_{QZto532-WiQ@yEqbUEh zwYTh9tq&57bpb{FE|J@TQ#R!NlE?zk!-%sAsC&$zeo$yVJ}ooi>NPUuQnFnq(s&l5 zQlg08TdY~}<*Qy&AjDs4u~gdHQXpclir4qBF5|sxGY*%iuTk0UxzK)ns34B%1jeIH z_`ECNE_oq9?!&LP2Mau8KHt^RAs-Hm4(35+o+}#KjGkW4-il8BYs%^|ZE!I#uT+7& zAf7640I+&@%g?Q z7sHh*(!#|XG!=OUUK3AHqsB`t%C<}cm~%3crGH{`MDN0*p2N zec{(o^dwUFk}_F)h3DMMDL%ez9HSZA2|dk)_*G2b=lh@Tbw5D)ykOB_d4t-QN`-Jc zxjrzoOBYishF|S9lu4|GnuLnsH`lz&F*SH?!SG~cC}K+r9vc`uvb)8$WFSf`U?Y8B z-}oYFv>({Blh)_0K?WVTw(lpU>ts0xvH|PslmR_uL0UZCToNhle~oz&|9LVkUEz*z zt=Eoah+fIB5WBT6qIh3Qi5c-_6yw!2uEv&jqGfoK2?9T*su|;Rm6SS5?b#mY_h8BN zK9Z;RP}1wPI4dOb@Bv4QxcfzSX`n-gaN-`9_$3&h?7ZhcqlZ$ z7Fn=$IcUh;V;bDx>Zzy>w8}VGEA^YTKG3l$`Q0)(IO36G8 zj3y_)EdAaw9&Ono*LfVkKS;xi>FNKYy7P=`Vq5#Tjb>;bq(~4%lp-||LKPGcrF%q! zp-2g#29csj4WJNI5WQ3pL8_ubNq{IV0qF@4nh-^}AgydpVW*4Ppf|?lP=UyG)H2x5OaC5RS1&H>%b!VmH7YU6=c|2 zxhdk-n4ww|{Q{?MoqIV885Q!^TMc#i<`T8J+NF7_Qt`l{3ogAKcdQ3NE+Q>3^TL>W zVFf3LjN~?c57A_8H1yzBP0z@%h9i1L+QvTUnN5op{$T&B&IuCL<8idv)z+&S(@hPV zwogO;m7#CguYY%uWuT6h-g*U05$j^Vt$vs2rj;7}%oVeZ*-tNf3)*f)0~X~km*o0B zEL2?A*?VkSXtcj{ZT!f&H)m)e&o2D-E8gNIZwA5*$zEPfX#A96SC8Mx{q z-6`Z7+8EBf-j8c8@=7=(L;v0tdS=dW8U14E_*`Q@&9~X|_apg3`}Kfjb=aLVn=3L^ zMfE@D^J)#(BqSKeE%UJtk2$4e%W@o(K(m`2sNGxLQTOi0l}5c+nvgp7>#HdQN4=xc zEn+kxJ-uf(zAW^MMiZltOHC?sh_)kJ9R1jQoUF=ng~i=Jf4OyVXn-JKWG*bi3)vP+ zeOUeVlMTr;ljlSkEX02GUCJVt65|#mlE>ZXAyQAcCRg}xD$6I?5Tm1u5{^FIJ6U$+ zU;j7@1@l2miEVU?V0d#&vpFmAf0Jg8UY@xs9HLdIj5u|)$&xC?C}p{~?{$76Jo?b5 zwB-QuPU-Lpd8+DM<$xXEN59-V7k&W#*ZWJ%9TWTiP+>mFraR0Q=_&*3^_^0xUEiNx zv!sLV-%IfSCQIRezr)g6zT0R<2=NrF`JfWOf?&F9+P`s$vn?~)FTA!Fk?-aJWFv@A zEZ270$zSJQ+b`Q@v)XVEt*+n9$F%>kLqGa>?>?Doy5M7AP`C#;Wv}toJwjdSp=Wx_ z6o+g1#C9CMTx0I>dbeiiH|(jI{(aB0WA~U@?N@fWJ$wLe>A11s*`y;GYOH{G`C%^O z#QY=LH@>_hm{QBHyps$(1i-UQBNRD6TIaqUD;z8*ghHLWd7<{~E^`XvIV`Ij)j*YZ zPOKJm@Z-oS+cJ4Np?|Z|=u-(h07ML-Pn;Jr1g-5MdllnR3AVeJO))=H(XJ9HlGV*l 
zpepFtKmn&qvm5@q;l${eAS2bPF;LG#yE3BtTWl&~x5HkScAi8?yp8v68yiCFdmtq} ze7kr>Q;w|6vDU+S`MVe`rqQ?k%~{3@mAi1MEo@|rgdOwVF2=5a^H@k2dGLA9E**>| zv$WUUDqqB5xBG1qASg;9#-QKb^kV*Bbn2dA{WnG{DeT++{}a4dR>W{kPZBJfImi@% zxscQi%IT2e9-_Q|5zfKe!|$PeedU zWFyYWD{@l*`kT~AsQu5B_$Am!{$CF;lnFMlgt*nH2Y5|s;}N1aPO%EC`6cX`pN%=! znd0%+Pq8wMIZn(R*ZSTDHX^J}@IB}tteb*j&xPG(`W1dWa=l=T$C%#%CC>kJ#A_0I_T2>$eIvz z^pyBGxgzKpa$nN?IxJ8y27g9<%`m^G6KBvfqSg1k2)}hUAEO#__Ax9Vzk<+rGw)E* zP;ye)@8|c50zn{d6m{w=m`~NZ!(l4*qD5VRzLe7CA{t)Zvvh950^xHTZ*?kl;2EyG2|3LaBS&!I;gccxV${w&4S!P5h!n;SH95`nnQZLm+^Y?_b| zg+lP-<^HP~{X#`5!)Rp6a@B>OSu4pX!xtNL>7kc9aVkyNaogJ++!R_CDn*MJ*-G6* znxG*Np2QyO%BaxrtF^S>&&ZZg}P|c%l4-JJncL zlNHky!wsCc*k~o?T4ZE-k6irrNoC~TCV;%L#t;oxRbHK%jR$awtYWXZJ)>oL$|1M% zr$Jj}^oz?e$fbH?Bb5*n_-*R3A5h}&c^0fa&+pkT5>HA`pZd#1?6Zn87^#TGC@F&t z#=0qkHVj@C~c(RINfq-^(?2TcrIrGL@1lm!=(< zrL-O9>tyiK14VspbnNDNxFp2)ns?NR@vk1%l^gDQ_0^N^u3$E|+kp;jwv4Sex|DC7 zY+MVxw6U@RCS4s#v4!wvllcSdm*ggrRpry`-tr9=awn*9&%fY)5b6EtUVf75`prMW zbogfAvF7o>nW(^=Tp3B)#tL&T41iX$&v%~x8Q2`=}uxr$laHJHw`5i~~>BlWp>bNV?d?H^uZ5qRR zC>PgL@vlj^+2!J-ne%cIBfO4K6J&%V9Cr9mu@!@?bODW<7vYlaKUcFxA|&(b@z2xK zIkq~G!}3cX$7|y=`Z?_ud8g9=E$NxrarR>#5o$mr2+E1jM^Yh8S$)U>6H91vkOG&F zv>#fRn6@$v>eJzV2Qy3ojPc%Z>3f&nz%n9`wN8`zHwo?oA6InEuce3?3;tN>3hLCL zZpAj;bbrNOdWs0^(fyE`78hNb`^Uq*8b?b<3Ye zL1G7py9(^nT5}>x66Dz&NM>|r17LOdtnT~aBg4+IiE}}!4&RbP((Wj|yq=P2}pJK3Uh^paZ9my5$U;4dED1qx#- zi^Aa)3ggn45H*t}SX5_O{dADR)44?RKpl~cB(-r-At`CLkauII*wzdN)I+U#^Khq1 zGBho4Bo<%9rfh6JLrF~h#1M0wnPka4ER-mYDmEA5tRgE2R^48%GC(;}c=XcKg!ebW zcXHddg!bX^E3OF;JUfH_GeTl{$B)&{UM95&diQQxi@jeu9vAzhKxp^@%Nd zb62}@!242#X{ayF3{j0?GeV+X6c2?)jh|R_R*hk#0<@P~I_GdRMO8@d zX@``w9qt~9gA};6S46_MJGs3a$Gj@x*S5|D<)KiO^Kk z(l-vPoIFOKkYVy|yp3%Utc%a1VHsG@K6^1&^r1Y4hzd_%rlyZ;vTCy3n#Q)RkcqI; z^4%o=LElJ?D+49MXxatPmXdPv(b!|I7r3cDAEz7EMLtW8bRdSasj#|IK&n)b{C-+9 zEd)_gd|0Hdqlr!{Cv)5oK65&J8bl|lv)30qP_sqTaX5porGbsY?tPFu($CEAisGS1l*K#XI!)>^a{gWO0yIm8mAwi5z3y~Qx zPVr?wujwRD&)Ma>=y6e9+y&5Eb(fN*mc-GV^r;_Rl0JaQeho-w&ukOEKs>rL&6Ti| 
zU^X2=>>_M}8A{WWs^d%?>mLxA;tK}tV77@99Oo~JQ%NARH*Q7ZLjl4je($po1_ zM@Dd;==;iYrFdK8f_@Hn+cXAQDSHasT{VN}dY`^KCE-W}Klss5DI6{$Ch7QMeLDbA zZCzvJy8Tg&FAJKas4(P*Sa(#D0s!8w1-gxON4RWUleDy83m#J*_Bp&5wGc~~M(W6Q zM76)o39E^U6t^p*KKZlaz6%1x_CyU%S-QB#;CGfB3+RZp63tRLH1%{qvIRr===s z3l7oy?3ShxK0S&q?lp>*;pDSVpAz1wgX|1nw%n?24a$2bI-*;K7X*p~TTh1{nUJN+ z3#evIl_pQcKuS6{g5ZMw&4qXNiv#z}?p}bjH;^D^SzE3nWZ%e=IvVkp5AMS7S~U*M z8?LJie)EIo;B~&_$%830#}9!hA!}TFG~Ix`Wac%Z8Qpsgi|3N0x69adaV&Imqn7J0 znBefE>dMP-cv;Z`VO5c`>hy{5bJNt%JuhFDH+D(iW1hg?Ahc>(*+mrhq$>v z8t-r4$5mBH1Y1ax%_xiC-uJTc-mwYV$*S85-UE-Xca@z>GLo+c2IQWz5zs|QCs@jN^0``;t| g?>QvOu5)CrIbsi+6`KJ6)7WEv-p2H;v1{DF0nA$VYybcN diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-7.png b/docs/source/assets/design/v1/prefix_caching/example-time-7.png index fc33ef50d4fdb415de61cb9e75a085d776a971e3..0b536de5a53f2c00cf8ebfaf554542132051b2f5 100644 GIT binary patch literal 55922 zcma&N2l(6MwLcD&5lY!*Q)U8T1zWaci8ckx+O}m|vSmpnP}!C&S(Y_Dq%_bl%4`cE zg|aDxwroP7%&^NWrL0g2EgKqEx$H3iuV{O3Fa7P6cB8Pf63fvL61kW9}XBn2>`^Rrk27(XB*XX{j(u>K?XE1zp8buQRIf^yc`bh*-@kq!8dpeK5HWQkima8 zb9fd$ys!_rEPCEhZ|j}170rVvI9FJZL%`$tWu{rJ%!5kcdT7}?_@ec$?WR?Ffi_)W zxuAF!Taa0h1CL5(;5wiXmCZol5uC}u3y}1O|C!(a6c#l@3#@h4K|6Jx-A1#*zbztf;RzNz#Xh3WKXkes)IsK!vf7CCiQKyHn2vr5k zNe$W)Uf28KSoG`i$i)Jkuj-hlW?+3>qG*K;NHaGHyk@@CZ)=CM}wjVC+%T>Mp~I3mCq>Dn5B?V%r%Xfq6Plc zATlK$?l^<26QEiX3*4+k1cQQJuOy(Cx~Fy%DGaM)s@~40{Z7?}hc(=6M8TtaeUQtm z0~_oGu1vChYc|jtW;erjI=tx=vL!-jz*rwEP75*FR@#Ha&Zw2h)oTbX@Y9h>G0jX4 zK{!H#k{WBZc&bw%D+;Q|eW#;!vCu@Bd?_gtd`)i6w2*g^p*=;Y7UttH40CyGNS2yo zww5V&OxEE{dN?cbY?rFEU7?p1>@nj9cVuw4ATnHBXjmeT=P-8+}VcOfefghUGCX6=uPR&g%o%7CJ>nVK|pFauw6Y8%c*UT1t$$D3$MbU5?gT1)w4Q z&WJ1qnZ`(jL(q{$Xc0G#1D&j5Laj3cI*|f@Wx5|^@kG)34CpD(gGWqdh%3s}CZpyeP8+Sc|Z{T@^@zW&^Yh8lySf2~ncU&+5~RUaul*WRO76D;D^0_V}xRTuVUMHso3B%A6n^Arb01( z;E7Sol!jHwkyuU7Haw_YY!LCZ4GI-t(^GoYmQR>@aN7*iPpv?CV9MZXW{)YZ2a_Qr zL2;PRr6q+%zOsX^)c(a>FDDz-?qlA)h;wVH9O4o)xfVblx(MG1YvX5g0U=_x8YpF 
z8cSZyBMfoc34KNjo3Ju~v0Tj>bm0ami-AiQ!Q3e~lXD{;lBfn$o4^&=bq1|`T*}aG zIp`y`!30zEF)&e7mM_PMmn*b#gF!S!Kwkn3d7)gcCb(ZX$q|m_fD|N&R?&&=XwLx{_^`hc1jcLgb^O(zcOa z7?dq!IOduTpRa-{;4ew40XU2JAZvoVQK4MWk?Bw@IVfV7o>k&d8cj7uu`!B(1uW^B zV+HD@Tk{9$EK|vA(-5cW!3g!~7S#g}KykcPGf)HAesKFQ^Zykr@IRIw9RCzM{0kh= z#RJ%+An}LOft!gHY&7%9e_?^ySFwOxYRkODR0>c}_6=Z^obE7d3+{lmx-HHPRV*ER zzd@1KIMW#9l!@7Zs@+s|gP{pL)Ejj=t)c>O7d4FuSf6T64pd(=)pDAEKie2c(3PKFcmYvt) zesLg_%LB8SPN_F8p(#|P`~tYA%6Y_1#;HWL4Bjbdkx#3M2<$gfG3X{%!uyC?!Fq9) zk(($1Iq7`I`8HY~yN%RNCQ1xj`E;>pPQxcorKZrK5~3o)^OH<^>;^@hlIjP<_u-hLls6B~$UtQP-QGXgKY6D+(#( z>NU0OHMMG;?UGZqo^AMuFwLu>2>Oz;Svr(uTt-Sgduq;dctn=SAhw!4QyR6L;-EnR z4O&UyJY;cfJL`yYi{ir~5EP2aj@)FL^F+HMsLta&amu8 zM{2Yv%PpzZiq;hKHK&j#pkgU0jLVD{c5tH~t8&TDL=HPmtlFS#n4_9T7bSgQPGNj1 zDrl#hm~l9pHv09EQv$}C6G^#`ieQ_H{l3@%O9u8<w!(qNL|RkuGgDyleyvqGDw;l*LO zoy%lGMlwBG^FhDHW35CmM84i>^%&KtwmhR&8z6?56YBL2Q66_a z7%hx5imv&H6DoZlcoN`H=L?NOh>T;WQ%=XF)ofr63P4ri9#*5OLenmyElz0F%Tq&R z6m1IUc%a?!eko}(YHFoiBCmBF0Ep0Xwo=vtP4;B5TcF)OS3d70I#b=Ue>_Jf;&P z;&sfTLC27l&ot_|D$ESIUg>q>A(RcuU`Tit9WlwcV^mY+VGvn7Hv;;C5-YM)P#w9c zQH8vn89~{&HPH>phRbj-6k@Sl=t|h~t@4|!w2>P*!1}SDkvwBQ)+{wSXQIDYkOI&Be|qM zYGre?xP>55FqIOqoMK8i45?{BE*2bXHXF8k5Kt`*sm=PV;4(?O*RWMI*Fz~L&ghf~ zv~ErTx+7}0-M*V{Xig_}pwbx1sb(Wo$~AIARLNwMiI&SERh^~VEC$7V7X!{@PvWD# zl%)V<7*A6K8kJiOq|%}reYw`R*dg9R3>;IX0$4q&VpGVox4{iytm;2kx+GJQHF5`d)u zEW`f|pMbLt{Tn{1MO=aArD`1Ny3`{C2g5V}!Y4^D7DN&r&n z)bin!KrvgY`b^KJCXVZqRl2~2k=Cez@`6+z1{5h_{!FQPDrZ6&YA{m_h=@heG$bYY|JcDl$?w$)WXps!h2t1Ip9 zMb6ZuKBsf)fa==mF>j6_hT-~Uj;RMqBgUqEIuq71Pa7Wcfo(iUulGh$HnYh}jV`L~v zLZe4vd8snwoQ5goaE~OyPS!23x<$EB1r(VN@s`QC9ZrqCzDR3Ds1E#@NlhgQ z;46}m)#?>WLMkR6RH8eft?`~-7ovhijhZfuz>Gi9yR|r}`K{EVv|8|>DVYAO$_FQ7{+K9_CMrfvPR4sK9C_nGVK!hp2D4?{#Wz1*{e0mCL zwwN#BwwrqRO5K8^jt4i}ur^ah7{ruHxLD{(3NcNhVAv3JsK@cF+W;aWB$-xoxhf)$J~dq_m*8j!)ez>Tztx-bfoJlz@8^4ZR; zIs+xOf|p8*{H2Q=Am+fp;Bb`F?7$nB$zkjlTFod2sLj+o@sT67Lv0*{^&)VoiYt zfPBVByghE32wBbXu(_L6Dc)a3kO1xE3J-C)Mg@{Ti(kGoLSU 
z^hlFz4Rzd1UI^k$z13yMjxwo`8Y9w4AJl|30am4~HkMjAnyun$BmwVtM%&$))r|>0 zB-&t-loHLbYJ1YAXI+-p` z3C_`RTadZ2Ix&(GVoy`6N`ZOghv^yTw2_8nrJ3SFepQx<0>gA>i8CE1y>THgPfff4 zac&mU>!kuNs&(Fv6u!(QzE;kIC16b=i)e+AhFrO5(GIX~)D)eyCMwthDpMJQ&O9Se z7G+5?`c=nhPSZt!@E#MTj$Gs(g<+NOl|TVqBqpcpR%?OHV}@CL7+^sU_AlldTM@kd205Iq<2SnxKNE`WomJ z9|@xgB|_Dr-O0nj1e1lD+^gtJn6$lK8&n9VrCA5bg40!@ zdwt3F@G#2-s-$WuxEYt*R$G9j(B%VbVuf)z%1@@{PSsau`Rr62=FlOMTKEh(aodn8 z5_l~Nas`@U91J{8VakBw0nLItNHFV;pur3sG>4KL&Kfq>nLtxRR&z~XQp!y(ZK%RJ zVgNLsSz52HT@F${9n-z*ngnx|C^oBvWJh8mOe8YU?QEe zi5F9_mPUFC>a5d+!~DM@H9C??UA~wftBee@d8e0wg66pCS!j=_RCwH~PV)SK^HPcD zfTV~PnJGDVFyiD=!!$sEjvBZ~jj&xrsSLWg5r`5^$a?4#O}ilkz@HxsED;#E!2meP zF*zKzToORbcCXXbYM9bacVRjfo1rX4ZH%b%#c87p=-Xklt;I~+q}xJhj&-xy>8J!T zQpX5Vt+g9f662@s+L-7x+LgHMRiFq!K-`?DsNI{X5;GBoT3g^*#F$a2LPfsatd;y? zUeOyim5KOi&28055VHX*gHM`4*BC|lUJpW0R5vgrpYAM?4TUJ%8|G}b5HV=E*ctNU zEXBxJhZqE34t;`F&@5KuLxvczM!hndibjQZfgz+6lP(Q2O|?iDX3@+J`+l1sL>6?1 z6|41n7sQ~v5>pZ3xZ9oJ38Qplr;*2^g!1APk}~cf>O+Q$H&rL=qLT`ptFS>=?cgf# z+q=OuZsu}yu|8`m^+3v0Y`-gtb%Q0u&TN=?ZNJj*jm1gVi>5ZloAFZn$z>B*l5o>US;Zqf-@sgELYbtJg?LgsX>jaVKwrA^2T!J)a)h=-S+iq zh3RpC@U9s|hmgsfP1nk#i&;SiDT~Uist8jJ2A~g*&`v+ym`YOv@k`X~Be@#?3fIgLa0~!R;W>m(6rnwTHC26KD0lyO;x=akV@5=+5k>iZ!x|lYY zx&^wS7W1~>mtt?A5~wZ@#1lt@vV>Am8tI0@kzI+RenJVR9hm^V5FT?F3$nD%Opn2ukZ5VLL%sM=(bvwDufO)|rdKMLw% zsqLEp{8DPj6s!sbEEZNpMIQ$zzu)aoYhKoz4lJeNi8UAjEy)w7VVDKS6oW=EZge@< zG5JPo+_uM&t=5c=TpT(rUxG;qA85JBFj0k)HB%J=v!>Jx$IObtz?$B+;Hllij7rX! 
z=n*?b^l=O;Q5|fQ+w-eqmtkY6mN&R&I>SndD&~_Cp#7lFuIA&3EjPq^}Y#eDS-C8m9%?eyF4V&)u3PFGD_rqzsTCZ7M z)^OVxL(UM&kog&)K?$W@GfNu4xsg{Ba4GNMM81}3LNNVTF~imSoYC!LlZBSm& zV}N!>1z^mxbT11>SUEeW+ReU0Qjn942Qkq`Kt6@5>TbJdCt4OFTPY?{5(KsixD*b6 zot3g(ub7u@eS|ejy&Ty|dGIVw(}nar7c^Q+EZPj^qDBjV>Z`Csp;FbW@+2PnAP8@U z6ERaMC_OWrP6m?AG9{9*+Q7O9qG%g+xeK^f-gX=y1q2Qyf1Sju}qlqH#?Qyb0awGz<|`r`$eVmffC# zHZ8wIb|_KwfvR*Rou1LNRxz&HVjO{P+$pb161WHpB@}##DFN9EJ&D%6?nH%Jv7;ix zLA^C;cR?glRtr>lnl5!*sg?DXN$7l~Q*1Eqq{)jJJWK$!;KL)}Q<_SxjF9$d(nLZU z(c^Bj+4B%>RBZvG6*M6AW~>AZn#4=I?sjmj$x<3UtoBF1`<&4f0@EOHG>p9qE4(KL)sBntl5RusNab- zPmc1)u*?~>jYCOPC&LM4AZnc%4?TtO5vg9xxKYN4g=R$@;szsDwID+g_3 z?|8=VGcd+iSzZzPd6guUrX~Ip&qyHJ1n>-5&u8n7kzcybKgP4dqWk)?Zi_O9eCH8%$tBhlLO7+|DBn7L3wnkemL+n&!7{Q127)S)AG{QVv z;#G&^pn4H6QY|qJj5op%iY!Y6g5DlEnIUko!1kqfKa^7;!qqmfL#B~*r$v0g6-ovQ zvQWlxKJLQ_j}i%LnW4Xlm==#4kXS<}ph^+{moQ}ysqTtm)ErH4OCLiIn*n;a_BCdcaNQ;BQ&Tt^)WEyztslbrMh^@%02OiAhv zb*`%9tTJTPoOY!Q9HGJt)G4JKSqytwrZ`mdUs7A08J{gRhkde+cd|S%Z?vF}%wR}* z1vxj1fw?3d3d=A8%4Nk$Fwvl@njm@<7zF)3Mi0( zV--4WF6x)T!NHRXJC!;`6Cwh+Qk)Ei?GP+02=#%;QhC+@2&WEs2SEb7L(?ZB*$UhB zT30MbikBZ6xHFqZGzeCbfb?fU9ZshwP;;t|>%gf1JW4U8wTP%5n}xmHj4Jl@o)X@F=WiwVG}5A;Sk zLMX8s%vdlHc>s2G22JWh)fqL)9Zeuna@gsPO8wcS-{2hAwa{D}Fhf*P#lp@YQ?kt3 zxC$IyitAQKGjVO~=riX!0d9GS_Rp3gi89(fYUUQ0i4W!&~^@1t&U}S9Q*=eAJEtD=3xC4zfc?gfx zQq=bvXyOs#04oT?AZ1SjQP`Kxq}PJ{gz{Al*fw8Vh_j zl6r{lP7;t>N7wKX%JKPT354!+-O8ugXZARc7?4omkv>pxAYw*nx#}$Rr+^p9%IdUS zh4i5@m=PtEgR*`L<@#|fPD(94XyGP|B1I|m$6c(|GIJcH_%j0YxFTR~DotWWBT~N) zQj(_qvP5LOdZ9!6Ie}}kq3K4tI?(vB3<2|Bmd69FoiIgML87AGEsTj?sPe%5EcSh> z2!1U|%D@#81cWRHoP2_rMQjh`nsxIi zKduz@Ry{(7g|eu}Z8;|(&B9<93!PDeL><41D{-YjNoc3os{<}}G-55J=m2J!wth0(lA*NoVZRIH%Vx2P4iZRwk(C=#Pj`J&s%8n%NLQF^3v6|q?MSpF74uTB)*U%sx7#$EG}3Dz z95^~<0Lyz-(`!wUhyhGa69uhy^cF+t=E!eWMwUD-1VGktriU7M)dl=ggybq5YPX4A#&rif2Z&91vdywx(JV$~xHh(%3U6*|=LtL7dWWvLt3FvH)-;-kyLQ zhR~nMR8!U^uMGKkgLSMy8FJc8s>!i91&2W!;9H8)ovd9&^LaV|h-Yj}8|n520Pj#` zIp^gE-Tylt1eN|B4<;?BS#!nq0R?K};}xORGXT8!7r*tDOyo5Cs46iXkbTs&(Lysa 
zs*c(nn`Nsi;xQ{36AvU%^=-02Rl24N1{rgq;-yxy0|7#>;Y8(Hj>|`G&O@q`Oi>*R zT0yn)f#`RwSmLTFbRx+xh9xQmEp9Ny@LUJj8n7RDAMV5i!Ub}T!9&tgSQg|s!WpC~ z^u0V&;l!!a$=6|YXaTK<_Z<+|%bNLOf`M>6tmOU6bKaoW zmMp=QJ$u?1Wb&QFAx9ptWgvz~X-q5A?&w7k6}cV-Ag_c;I)X3=%5*RdfqOlWkcR2L zANpY^XH)pu@1rPIfu$H3`*J3m6DAAv(T>NW}lhnaVO_COsrF{4NThy3}XRAxMZ>CD!qEen4(2K>I5jx5KgD< z;az9oKuxI71gR%Fo{PjxOyouoS(_zFH%X=)+i&}}KDJC|k`*)s&<0(zHx(-mHx%>Q zv^Xl(hZT?>sAo01k|97Ygdw1l3F(R&4Yj&4P_q#?lEI#hY@%dh4as!!Wg*VlxB}Jn z3QaZ3M3}&EF)~1ey&rZ>yvv6|wt*-uRbVK=BU_RJTTnru^W}O}Xq$zG$9RjPuQq)wOoC0ZaR0-mp=UMJNl`s_I0s+svRlMw=`)2R9FF|AZeDdKN; zRcowLewQLgiCY&1sc4UC#0*F{9(O=>8_#E2rab`p194pdp;z!sn8a1bs*?3e#8rnZ zX#u?2Z&!!9*e2S8j@Re1Y^s?U;ITFZyvmV$8Y9bdBTlyqP**Hv5i?_eBv^86c}-gx zfGA{+aN7A44VPg!Qml!H@fnHjRgGdONK}>U)Ip01$b@m=UQ2F+5J0K#kG#BIHA@zi zYAdSTtF}wP$EUi3MnJ$cKS}dAaAyj4a9(bd@^)dUhWyMjMj&Rcg+Vyx-8|Ycwfxu^ z2{10#18c-ILT-w83)GlPaNug910eG9z)R^!CzGKHgK9Mbk!g$oFL|&XN*d0FqCX4C zwi*%%QXo=$GYBnnb;N+6XjQ=o%p zP^jZcMg|%c2iPrw0C>Z(1n8=)47ZKQ0StT^&%rUKMZmn#wA8#`bwHAfQ|g6ThGpQP z0>W-Z8e{ljNo=O53Lx)7MNF-4debo6q&O1D(u9GF^?JEm)p|m{?Ic2#>=iSz{AFMQ z6sBiJ7Z@tsmD)}^pMvU^(NtAR(KKC%Y-r^b7mro2z*ILZu*GpZWoN2`relB{E?TYz z>6Vm~PTF@t!c;1#_|gz0yq9OgxMj|Y36*IMnB?Me+dv20WXQ&FmFk)* zRj4QUEEi{72pvcq$n&XAygodT$e5oMor)`r6+E$UYls4Kgy(7B@5mLDo4?`)J?Ajs6jfd4SRf7C*Bx50l7leNkZV9E>0M;!9?s#XUkX<&=>3c8R$zqYoZgVS|7yqN+G};ViLBYPJJqPkpuDVbReQ% z>gu^v38R^YK4}|%F+C})J8KGgXxgORs@Wn6W8LnQ`79__k4p)H0R|Mj3PI9&!hn68 z?(}hQlv69}s5eQ)$Zi7Mmi0O?M#Uo3G@9KMsdgLzObCq5XTn?5WV_>dufOLGHz|Vd{N=1Uc~L%>**Q>ofJ5 z8|C9J5_dzh9Y(EoU}UkR?K0U3G81hF@|-Nzjw?ZSAW}vHjHcSth#Ulx&xhb`9C#2i z5zJL!5%EX08ItXTrRo9r(J}=q*OFV!U|MFt%O*j_uF-9xV?cZ{o9)>G1<#6Q7 zCJZ3{OiIyWxuS$MIdlk%!zuw&Zw$LhMUm1CO7eN3 zp0Y|9szz;J2x6?1-mLFMab|E2doo&R@xd(FRM^Zz}rFMm2j0IltH>_a1Hz5f{ax?3+_ zz0rS9!>Td1zU%q(RFS%sl7K`Sb z8_s=m%Q>gL{@A7qj(yeHZ_VE9Z|0ucKIQ_Ve(DEjM893}@Vqy7*nR$D=B5Mg4?co^ ze@djKWB6d%$K`{lNA1SLm+ZghTkMPT;7zZ*XtP5PZzk|;mnEUi-o_-vRwr$E`d3%) zzCG5vaj*F|bFXdt_CoQ|+?LYH!LpCpgZM|69dh!j?GIhNbj4qnt=?$cl?N>|He3xK 
zv+L>CO_m|2U%T5D+(Yv;=N9tOpWP8|VV*npvmIVKdfqjX_r$~gLhQ9feDaqBI;XBYhLXD>Kpx7}a+$bIXlmBZ?L9Kp8c zp1bRuYj0|7%dP+3*~K&W`Sz6;&Ux5hcIDSEPi|Om&s`4Q?zNBGi&)@it6Rz8I zyUGWtLvSGKiGYXTjcpG@gufbzVwI4<{Q3s%`-=za!XB^$Nu+jSrKV%K?VPC94Nx?qhDrmS@f> zU4Grl^Z8ePXEvX+irCqTx7kU|7u?A z8~aDc&wHl4@}2ijzGGK=kI`i}^mf@*ei@AH%#-HLKLI`Q;!QW1x5tma_{q;c*lhow zy|LuIHA@e<`~~Ff<1g5M*Jp0HZ0@1qC7VbMV$Wji#Y+y{VtCly{z==}H+=NMZifZ#Wh!4{m9ENJay{g`D;Hsta!+Ivh?WwKe+gE=6`Fe-Hx2|(peXr{7flcbPT!G zla&u%Sb5|Y&mVm}(Omh(RoZTdzlOMWxAnHv550GnL)QP03K}>zRAcBQHN+-bvdc-evCMowj`Lu;Y^l z7o7b3(kHf?J|`aq9d;wX@#SZ|L@(TH1LxSEpM33|#{I8%fAf>W%a_-8_AKB!QSLdF)%M%-P&n=zsL;beB8}et^^6x&d z-{QTuyySRpt2?*P{`hu%*LjZ!t9HBi2d{3}L7)frJN^DUmLBrboNb>8SMIiKi{l@k z_wfyzAGueb|j#eDJM3p0QWX-MYT{J?b__=b9d&_-K`{Uoe|Fb3M$pI@v_doW_dogU%0C{lgKVvw`^#@XzwgP%ZnNJT zw?qd&b6)?e6$bz*jPCpCoRx2!d-tlXuCJEKt#998yG`cPKU(o^f5BdT`9tm-GLDJ$gF;@S&{UP1^aZ_WR^Rl-<97)HRo1 zyZmR5{YR6h=wOV7a7i)?DOYpd(<+2IHAsc zw&RQE9CR0P-uKVh`WMSThVQ|~KRV$%cc6RY`@C;Icg}rtzy1C>+}}6eYYXnDOIE)$ zJPNYUx^tVmH~h%^^IhmpuX=NiIqjtP`!DXi*-|f9`T6jiyLR7#U$EEwyPhB3xEun@ zd+{^tG@g3;xcoUEWd3juw)1+=zts8cDfGC~Zs#0&`N99JzwrYe`n?4gSHLFjQvmBH zSnHg9^)W~P z?_3+KgcrSk$KE?GT+>+f#UuTL-v8*L@sh_r{&VzL^~6;Np0VKg4|lKb-JgB-$=`3g z>88@=*O|@sy}QKBg54{>UZ8nLe0Ie&|AS}pOK$zxe&Ukxr37A9^VD|c(AHr)_K9#)t=7)zKdF7FJ zDEl5hI=l1W>E-Q?xg-Akq?7(Kef#=nzo}CeuN(egWwPlm7oGN#h4_chJhl5(>Vm(nbN+%q+xhk1zP)J|{Iw_wr9X@anH_Tsik};g{GK@4b7~rBTzl&V7nG zZs%*ye{08s_+RaETPE21MdTKD5u4?l9v&;Q)Mfn4@Pxo3LWzp1|>+OPVHa=XvJ=%=gD&p*BFnYl0B`uX+PBWr(u6uG1S zYaeayTa{vf~p?P%8(n;&-G^TGG;6y7}fM~VCVMF)NV{DoH> ze6M^w_oHy`eYbt_Io)6FUUAuLhhMPYf`tWl`!kwT#_u0FDfia&(7nI=&Wh0+$gjVDDsuSukNdB0 z9izYc*Sp^P{ND4eKff4O>r`}Qv}B`=ev;H5zV*GcjGyfK+j$H=;y&z}d~zW1HjR>|6<`A3#7{yVgZzVX2;PVnw|^zG07JU-~! 
z2R?sm&4;z~eleN8_vCcHSM1+BggkWi9ee%k1Z|IV_AG9;<7XGhKUY6}e0}GZl{e10 zbFW1&{^V?1h(WULnF8yxl-bV-1NN1CNE$A&`zthlXrUffI}}d3X><#x&N-{_KmkY>w@3kK7ITA zBR_xSwlBD6e{lWk11?2w{d{uW@X*462fVe{{CACy9(eY|AMHNhFYc86TzmC{9S>ae z$SV`4J^J3JYo2;!(NUYcvDrpX{_)WpH@{SU;^=pNxl~wt%3gOqwf@UTu7BqXCqFT~ z`x8`@!|~jW^KQS9ZE+FTVZYPgWhe z*=h89!qEryU#=eh!Hd7y_<>sn?|gdsaQmaKS@Pt%qc65!y7+~QSD%}_e)Dw)4_>?P z=4a*|ap3)bo}Tx_2QNSPa`nX(@9bVZ22^@{1O38t&b;=VBgt>C`tXtdO}BmF9CXzy z_s5U@bP0LaYY(rVy>ao~UyENoYKLw6N0k>nedABRt9*LQzHi?2o2Os=t9$qQOKyAT zsaw8&&bHK%w}#Ix`N;trpOjg*`uXRdK5@%6H-Sk!{oO5In@)yT{QXGzSLG|;`n7e@ z$2TmwW9>K510FkP&x2>Teg47CSKpqSy!4{9*VfD9!e{s2_VvD6zvh}%53hXnvheK7 z9{+88*9|}1p|{JK@Zn{n7Cm{(N3T7KuY1Hh7wgNu_1xAk1G5De&z%42_~6f8_~6|) zZ*I@~@oP`dyXu)I_V`4WKRffYOEx|5(u?^g;rq|{#f5dCOm=wkDfrnR{A%yLz7ee5 zYKPxEaQ}8UFTC@UHy7>W{=9$IyZ4^IgW96oj#%~A&)ycF{3-qZ>lfzugI0eWW*0xR^~=A1VZTl4h2IWd z`uUae`@h(1;i~h#tLDW|-nen`VLnn+zW&gUPeIq-`O2}uJx{#&>2LVPZBPGsaP}3~ z?|ZDU%^#M%eeY?%JowGuf3R}Bd#=9YljoIp-}~^{BU5EM{=*J?4=y|4`uXn%_wIYh zUhBX7?E}B_!Zmm98;3u=L;dXi>-qT|>j%zL-1P@v!+!SeCfnS*(E(>JjDLO7hEHF+ zsBpdTw-1}fyj#BU%}eKQeR>Z)`-1&%yrA-bZG71$?>=Eu^4Bpz1P~mn7=vaV*Jk7Jv?(eK!GE= znyuk-r84bdquPT=hlc<6s{>5=KQU{^j9_icjnD(o=V;CXjflIRAo$ZETc4 zP(1&+@#2wWznEsF`~w`|4X+VGb0wDWS+F{Rler}TpSk5!LjGZ^pnTaFW4mrV=BHVtD2 zJw7e22tUaPboFZYT0-%!GBcJ_S6Solvc&5T>ERDO>6$MrmS6b%qva7=cB7yW5|gcg zdCwV#_HEm4=iSsj`0I{D(-6ZY&Mp~TJWXjds}XtA!zS<(r*6i=;{2FoSEQ0eefP~* zw%3sl`!r&Y=R3JAj_nE(Dg`AX>t!>$TY9Y9SYSI67&>ghcUs{RcE0!MdYF8wueE_us^N+gn!*4+}&q2yg{@U2)Pc^ zo#$4MlGko*NhKW3-|a^L=hD(fzl+FTI(&9( zdi*(9;l<<#-*PsAoly#0XF}I+J;OG)GMfP33?HK%bXprVMr5n8hIV%i)1S9VCgaLb za}^dYeaOhH#VG%1Xiq<}SEye)zLjoQ=Z`?&l9ptvM9(M>ZYM~{ZsQ(8gS0a>zh@<-}uk~}}L3xab{!e_K^IwavY0kQvvKN00 zo!#ED&#Jho>OT;@PVBEk&|#YS996Ov|?_0tYIh+SZkgr;Rb2C%a7I zDzCU{ogNnDx%#(*c0-e&j;vJ!_eAhJdwB(dFzNiicl1x~%ztiK}5xKlcJj82p zQHvnkOG-yn7PPTGW|zBQYhHH2LShlG$jwR@2R2QQ3W)1ok)0c5<>;boelv_w>;Ppa z6dlxJsw|1Bc2FuUr1eXTA{ne^8qDl#q^*f-pW$T~E2DKIhQDch|T@k%-!pG)57qzthuq) 
zeIz;*u1gBO@dJO_6WpigY4piIv_q2|vUx6pB|R-4#1zTL)fN&LGzxVWKO|@Q6%gw? zM!)&Ux3uXYvw8n~I+@V~-tt%}wd`Znxr`-+vR`c`r^D&u^drFWaec3$LyCoXnuT#~ z&#ra-%D39P%yrT57Mmb5M^3-=p~;}#N|vG{)tSQQB6vsyiuH2;r)j&53IA5u56js# z$9F9i8nV?23+KnruBsrZPsqJUT}Ge5pJS=OGmm8ay*%fi&|l@~tlxF^XYYN%VVw#s zAD(LSG9A0tyBPukA^7w|SwUtL=^U<9_zs~7mxt3`!v(O>134aRKhC(q}1fCokzAmL_j3CkEI%_6K zX!oi4C24sMc5=`IorUl&SnRHk6^WLNBDbG6ZE7*Rb%;6my-xm;^gQP_>ep5n2@;)Z zfl^COv9SJqt&`+t$8q;7;SNdTOim}IVVKYP$mI{c=GbNwD@IjavuKJUvO2c%_}#aY z@)5~46){yOvMx(hjj>>_+16%C@tzM}*8E7O{Z?fkhx$mKL&}#u>5Gj;K42w02+9`; z3j5}amUuWSystSZn;vz%1&jd>x|Q30U0Epe#s&rp++++-F*5kNsUmu7ZVWe9!^VN% z@H$K{RmGtgkrUO#=exzQt0Q}xZFSG5I?Aj?{!iCNK-Mz5!ok1uo5p!8lZ>121!#Yt z+n|QO#xjqL$+o=Upvmf?#>?dfJdH!n8|ieM(XZ&CBl2+OgrJ5WA#!tu&XRtq@~u%y z^;ZST4nNy7de}vlMQtWnI6= zQ{`{J$zmfOvb=!fS!dbw@x5Ur7z*kUBYX4$dO)<8*nE#~Xpo7#?DtU8>*Mi4{T~yy z0>xdfo9IiTxEi_L)Y#Di&kIN~5txKswo|l<2xNry1T16_UdkNm*x3y}Q3lD{L^{n3Jwd1+C$5Cx`fqc#mGa8r7FK?sPckZ_UdQV7VOOJG6aw--1xTivgLeO$L6VtF|x^0II$<98?i(q}OJJr|Fq`Q5o4EuMan&D&d8uWcZ{G zu#yFwd=0#q1@~((6~STpzMG@hJz|59jV$Txl7)qLvyajIA2Yo$CaDrHXFF^UTrR^< z-d+ZOny#LFH@dVpBiwy{+mAkNbp&#uVAslAchp&px1 z7dHG|IqA%ww>Qf_qIl+iH;O$r?0O`!)sH-z?ZZhp)z>NI+O3WD(k~#XEkbBxcDx9P z8+6c>+K)HWy0@N4@(p zo4#Dqz%tjr^>HJOmcRwP19(#nJHLO&A$O8@1iqZOUyuna&CXQLb)s6U<7G&B{!6qx zaTVL9L3&%i5@jpDJ6Va}k}9@UgY=%Zs{pYG^Y#;FxJ0LqL*IQ88ZHpbbn75u&zo1c*5tKAR6cz-UG1}+F(kBK~5?%gX*^?Y;NJ!vu%u;RY= zulba(yYz+tQLZL9uIdk>a%OlfRu9BdJmMnLLTCuJyjNXED+qA@h zDrU#67kU>eNp#7xI$OgI>if-?SpCO+zgJg?ZCj}B?}qFSA2CWC-*9qNH+BOz5=EYNgX6$9^R}TXaG_Nf!q|v1idSiPKlg59-$`e_UqP{awU3-yf z=F*q{a|!B;y-pdyB3!xM_guj2jmYiMKu~DTpWrh{)V5&vX|z|F(Smand+wWmu{ zM?lF)(<)ZP7U6lR=sD~$F3IrJ6I+dDX%NY@2|RyV75BaXnNBomm%CZET0`Eri%WErqNG^SdX(pHP z3{CYGLBqLfHh_6LqpN2a5BvIJ$A8D!d#jesI!l0*L3rKad`{@B5591K%{RoF`>~?I zqIjU7W#7lZh~0;YVUFOZNwoaGNnnppcW|e(R~6p86iroLml7Oh&Z{;EW@11~zIl$4 zT>uvMYWRLhtyXslGZ!7K`+}(4Fn|Vu`(2GAZ}Q#8B{gj5{RiaXul(b>y)j)jHR#NlXQuy_}b+w|dfCK$bbZEvmVcq$n^TcFaOM&*#IMKpg 
z(c2v$$h(wbOL&;tp_J9<$Nu%6oyw3m2Q@|?x^H`eMCYp~Sdn$zlF67XlQX;AmM)M7 zY9DlP&7L-oWSq7LM2CO>&f8{wvGL8;+hV>fJ;MFq@}pEVi-gJDRx`!h25A)7v*=l= zsQ75r%XBR|K_nvB8Xsaj`1AUnozv~jU67#RK=1xHW=sD(=4g_^3$wB6w@CR8Pz|Jv z`GYVm&uyq))As(PE+sAgj4?9O(1inymdZr^Z|-0Mk~Iub@Z|!4Z0vl)YOU=*6cE@m zS#nZxyOml;liTvk5K&esGmhq!xDQeORpBss*1Mz?FYb;S}rKmzL^YzJ7SeCgTVRN#j&is#+ z7wn4}hkJhMB+R+UbWFAxX2Hw1N5_+kdX)noZ>#oPR@%I5?d$O3-_3W!`y)-reO_%g znq|>KLes4+@=cyyS#OcTNg-A4)-v2q(J#u$=iI#9+LSWOtUhQhH4aeHAJeaxCUV_U zISiik)3T2`Tu^~KFe637Alp5^`@~3=5wOk3mVItt+LpWJzMk!i%Dr)_$}p5Ydew%) z+(H`<)QIG?5C z-}15b(%xn05*L7XPzlyEQ1c(lI7Y$Ma+?tEwc$0Qaq;Z5w+$bhgi`;N``lPx8xXn~hZHhV0^I(3dD zG?(G)a}q-a5&i}6(~AyuA#?qug~DodbwOZ`a#`848zet=1kDT=BU5+DdTLQPC04^70%wzB?b~rBcCMdo?($_p`=~LIBJa!ys-AMV?b^G!D)NuA z0A?ix>!ja+0MjkW*n1yu%q7#{oaHL!LNHYT6EdzjS;B~j@=TLy2B2}A*P*)cO}{uo zz&J9S;%V-zuBO5cdM(cR3*SExlfUZY%WxY3=6KFH6%I5@64h{z&)*bs6nS46?B|pn zwlGVWbkAhEoNp5Bgv*bqFB{@4&S@d>h(2-Xqa?AgYrBwhgKDV4`YK>~0M4IbMYHzR zY%O{qOW>|cS#S%LU>kALwSB?%R|ZUf{3{1WGVcOnbz%v)4WOiBX?0bzYy1EE4>X7o?{!g{Fl{+O0;xiK?^b%V6XN9$-?9Z%LHnsd zV(&+Bt=2OO7_s(ZBL@h7jCTj*Mo%CuTI%KIU#+pGwMzjr*_w$eQ zz89#@q7lRL`!;Icw_f&dM~8;NgaRrTDr~!E2;Q48|igw3c%i4evf~2%hTd+ z<24l3jB`Tg@?h1d$@wz|Bv@Px?mwKA?I%~oyhug$hs_G+Q=aijTa%KK#>Vbj83q6} z`_#rU;k?vU;`@*5|BR3S@9_4f+Tm5`d)`K1f*-==b@htxXI4brpy&xIB)tja4X;2% z#STigBly+_=n~M;AQEFyuYrBlm=^$8wVTE@izVgXfoF#ONEI?U|S z=n$Ty@(EQl-7;l-5mn5cm?8}T;dDCefzr)HTz}L<0|1;!nj-A`fPpB9HAU$v3=6|> zW(A*{UgCQ5MAp6ygN(^>?FhdR0FzMVFJm9w$pPJQ0g*6r!Xel}38i-}+b|=-EFqdn zK?PqtcAc+o5r!nhW$OM-C_!0?v{WrdL@v|w)p$bcSc8-9%?lEJ85N)&S8xb8N!lw1 z{1AlR8rKo)`!y$t2&e-nmMq?njKhc9uaMi}fY;*nH*0R6_aUg7HDM$mLgpAwmKx)b4??7YeJo)aEwMt9j7MrtAw|12pi{G2e zety=igG=EL(`sxiXCR#zuO-KyyHotr_n9(nKWz_x7IS64Ziv3`G=2sxTXLI|jS)UC zMFnl*9L_U*ET@mQKN3w^OB9%Sq7R@A9Um%pWrK{Knq#@Y(V!j{9PSf*t9Ed>t&e$= z5zI9I?E%@WaiL&1<9ms~nLX=qEVy+>MKt?LOQIFAAR@0FE;Ir#Js@d^n>wJ*M_3V5{;yVoY{3aef7=v!=7?pi&v_ zM;&eew70d^n_m2ze$qcWgKKqi__5cou8Nhz>?{r@l`-|$H-~KEaUXTo_7OVLK?v34 
z4gEfsGv$xkzyqs?VZipWn5du}+C7w@KXZEa&C*{cqr{_hqr(EA{r-hWJCb!v)q6IL z$7jqAtiZI%E|8`k zF}?Xj?})%|2@kAcxBWl32C%lM_fg44e|3xZ0ph_}OhCN;Skxo{2XzLYZ@KlDo?EJ; z-?xorH8%!^YtrV%IE@~}h}@Unl6w!cNHP0b1~m7_@a* zvC%6F7Nn7rLMMaerR^Or4A{r|Bv03- z&xrJqWW(3|m*iVp{=+CfM*D4T7Cy(ibd^OT0XEdHDR|2tcD3E6$OBq|=C)Y-alGkj zkytjS7Hl8Y)(4bUQx%vmh=t}9b5j>z67s3Ci3lrE_+8{*ELmMrj}<6fEE0zIzVD+A zcOptPUvngE{n3q-1Hf@OMUuP8CcC6mSwqqZ2-J?7n4OE?T#R)=w>1gq+2J{n!=&iD z2)Bo-N1|O5p{6#2>2`l6Q|1QsB)v%7A5=IiTh?v_oVz$eo4aTAeT;+oohdeuh}ncM zKbBTxN(r;;>X%;6lL>YI5Z1uMXEJvy^+)oirf`38;8ATHTqM8MK*+|}?z@*7AU@Fg zUgvhSTB~kNDAUyez4f$<207^&Zz(WZl@dM*s}|!tq5)ZUjvhJD6=xP@&3G~@YWqau{CGIoRdBfot#Y9yn*pB zg~gBKZme`!>(g|5hYgu5{pNGCw(g$HD}eK{H0J|q3yDt6RcBLLelnD-?}JnAH%b1T zUigO_^!9fqTc!m70r$UzEUpz~*=B`Hs{wt+ut79W>Fh|EWp|&F`lbIR9OtK_q zbVoO_w>mHqM#_JTez+L#${ST3P6V1Y*dd}+bFZM?7em_{sC4#DK(&(WXH=P8a-0M- zgKjw|qs-5{Zra)mCzuHelw6rDIX&-KH3_~sCvglr1y!^_7mX{>e+MQqOedg;g3)IY z`t^$d!2JCmfbS_f5A2daxPZ>CZM{?hoq}SWdTzoOROx$lY<8e^(4JT)KPn*Z6RP~H zDcjc}9z_BvP^|tHOQs8ae$>a=CSE>xVL~Z1Yo}BP4*=OonqN&No{!=%`8J%;I_X3b zBp^1|y2W?dwcU22N#?$a*`4kE$@ziY&L^^W8U2^J=-kgo#*ws3qgc3%?4J&4U&Og) z&j0p6{R>!lA!k%`v??n2q|S8&9XV+&EF&DR&dtXkE!<6hy?bx6kHlu5o?W=gdBc)r zc4a<*L`POvZ(R{omFgsmM3+T-wtO&TSCTj;qO(2l%^EXdZkR82Jtw!G_3mz{{13a! 
zMeMv=clN{q+5WPBa%i+fBj{Ctpoz;Do1edYm)fS(05zry#P5t`QjgxA|YWR6l|HmL6#7Y-!A%yl%bP{Q!G#>JauS$e&NeOi&hSRsn#C zf|Uk>=MV$_Dmwxgv6F7H@Vp;fs0Z@k6cg#l#!YeiaH%v^SU%z?HitrP#K~dQ)ouH3 zxm=;zGir!%rlU26Z|q>n`~zIj=dqi=QtRN7u@~{hC`g#pDEDNyhR$ z4$iYi$F&DyE4o=2fwD>!WT1amnk8DjE$+q9-0&Q_lum$!Q#LP8q9nuP?Wn-v3R1Lk zW|>_s&8y=5MTGlUiSfg&nvs@bn&(KSXQC%9G$U-y>^8u=OF`o30AizEq5J|Ph3r+?w|plGX8WCJp4EaVLMPVt3yuD4hn_-JoWc^* z{>KYsCGznhmO9#aN7KbGR*CU!<20A&)Bd-IoA^mBIVMz&`w9I^diwMsLhwfYpXCh_ zW*OervHjMar^)($leW@R6W!l$cY&f({#j)+!zbR;%$Azokx{aE7V+U?`d6)L$+JQW zWb%D(G0vbw#OL@2)#@srPa9MdejT@0+C$s+o-B!Ygy^O5Nuf|Q#AvkL#@?oKz<#~C zB@m?7Qt_$Hqyx<=FFEzz3_i&o$xw6fFpYT^FCxTZnbILR~N6A>7>a?fSdcHJIfnJKzfS|#DV!iqdPM~0<*s#-VW7n?T-LL3` zkZ)`06I@*;O%F}Xx{P$FeD=FAa~R@L7D>{8!ugDA)!6?iu0+6czDU^CZ<$-$PQA#l zx?+IVd-%&jgy83h^a@+3(3(Fr!Bd>;FR9^b;g86wAk@e`1~-)z<|KkuVt zE4w$ccJSf<>zkhIZ@YKhT+HwfN`8fYYxpb#APeXy*DN~)03;$h)pK5Hr-OwtzIuMY z(xFL=SJN?jzOJ#iNh6u1#-p~SH!ct8&m6Uvu}0tFN6>-&PujzD)R#>5BBu2T_Jz&@ zun8D6YAzRj2v~IP3V@kuJ5kDd{#tazwe5l58V5i%7&B7o654J_W7rAfVskP|LJbtg zQmk0`qN+r>Zb{M_8i%*)6D_2lxuB!I0RjCLtuw-N&hCR1&%&+-Bq-TGr2P33rJqCc zhd%M4w=RE=kZCCyat^*GgTyc#ZX3jEcnc3 z>ipuvMrdyFvafp%u_osU$L4PVh-AwW^-I4Nk)@5wB#g@0%F1aU>VV+&*O%8;wZU>< z>)R?2n7Pp(yL~O+BAm^1-^e&BvB&@o$(COcooSlX)vC63)vAl4Cv9O9W~vP=E+TKxwwa6~B~+2&i?)bs%v?q>Cd5ROFJqs9f(v_&NGcZm zgbrl*TbziE5d-0Q+1|vgW>6SN+WyVO&Wa=H&Ixfy@L$I%7et+uB3a!hKtLggfSTgE zI38+aZ;jfiwB6tEqzdDuC|F)4bN-PmN__rv$&JD5hKmeHCaMdm0C*d_eq2B$92`M2Zj+PF zB&qc-Vt4OPAjM8_=di!~4IGDn{86F}+rRAkF;w{R&v|~JVne-qh`dEv zJRBX25a!+hM#fF|&Zy|mx)Ep?_{bZ&mB^u!V3Z)EnqX|Kev=f0q`}JN2p(6JPXtJT z7@s-&d)XWM`9tGWmV`Mey+ZiZ!aElkx*l;zL(ID?N59cNwMniU313>)j6)Dk=Ob`= zMF*o=BOc~UI7Xt2N1D&vP8Oz$MC}WDmxA%RY3y61H`e#>MxU+xqHz;yl|}SmI?x$p z4=RT(+JIo0vT>Noi2zf*lRhX9LqGL8Tm4}HjvggBeyB0{;v?|H#RAeyixXBw-S)qH zy&Bshiq2l&Trn`GHCRhALVpAm^We+!oe%c( z+^3WQq&M#kde7nBD+=%1nHCmTjaaWL)2|UbTf2bDr<>a5I^m7KGa3YxNVz=6=Bn#iOC3+N^)5AQqfcX?-*rDiuX|EY z8B>*U-SQ0S=OIX@^hNVr)zRt5h8rXsYx_{g*uZl2L?f&FhF&dEJPO=N>D 
zql!L#sC^u^L3c~pT)vMD2wHS)%n-eK4-5gg`!B)8|6q0?zkjG*)c})g&hfP=pm33I zM98hG$FTVS{X5P-f{i~q5Zfn!fMntcw;>uz^S_y5e1fA}C&aI;hVpUI0EAHTileFy zLHPe8G?CW>F?NeuG$tTC1v6*eX+PUFl>9GMkf!(_9nB2x%yXmvQYQV8;>?Ky5|xXZ z;KZJPiGoxL{^*R{Y%|rf&mC&<|gqquxT)54dPcQ{0hTN)nxb7yhD~)HNAo+ z(IcIBGgt`zR`d+$J|&S~^ZPdPsGzcCYgUz(X!CT5Ry}S?u-CFQ-nlBz^tCX`jS2fEU9Gzj+7 z>Y=Fjby&O#fYvlrK8AhqjdxEqO|tt~Tk+#vZOBt56#AN~2sU zzDDAMvFx(!yv_$V*<`sJ(GMcB7|4{b5}jl6gaX=^Sa-q%>r}M7f=)yJ^I5)TBCU;# zq=eeA75k{{i_=CW27!7`+*9-3acl()Q^Dlsl_zN8SYvCf*!zw3!eDbFaZuDn?Nqyy zQe|2nDhq*XmCt6k7%Xb?p2dwO1zlNTp>~DD zy^-;xBos`OF~Z!%dx4Kg7nLw2KB4QMVnW$zYSfZ983^;E+!2z!l(%Hs=@GU!eKWWl|NZvAb<(ey3f+A|zuR`VQ#8fJ2Xi@gPX7#bX8bwfP&g&|${j zUD&a|rzEY9t9g(86gPqfjVCy<^L?j*zW@@r({QXqPBw?UG!Z9onH~vo7*iH5H~+fA zmHhS2`Ct281?aCJPpHLd$PiOnI0ofp;F__9C5|_@ ztks5iF*^;#&{f)C4Q;*aY&jv3<$30gSwXuXo|<{zD`$!nPWy|~-lB+f{f(ZxMzm|v znV~_jgo;t8HNJ88JJxdRzP=B#UAlEmd3#Skg={ky$feN~f2YkB!QuiRC6=?6&ycUS zRQ&H<`7cGJq)%h$==fm-(V{=%(H+|b(c{b2z3WXwmpc4rl8)oly4{oXnmI|4PKfmG zx99{KJ!5rsHK;i9y#tcHKqi|-_4@&i4jobu3wXuFl%0^>edRVfM*?Y$WUNiPJ?+e5 zMVo?Xa~fTaIbKA*>HS2@3ORFfY2*Rk-3Tm((iD$LA348P+Chk0sGS6MW?xhIx0}KH zIodA*R=@$Dis*cp%bL=kaPa?is;ZZYQ>FNNM#sJ$6;Rk}o1xq?`aNe?e_5xs3WfM8 zh$Q#iyGY|Cm{xOX7DqiHsX{C5hDwRpQxb=w_`*>o0(Mu@JY80ZBU}nqDFunXCZUw5 z-?1d+^!Tir%7!T&O4e|8EE4T6(E9bjQVcNYO$R@TS{z`F3Rqri=_lU?Rz#`Om{?2f{)Kvmzm2}_->kOrFmak{pS{4u^BCGrNA8Y z6*hw?FtA`~99Q?8K9LYXkNJCca_XDvOmJgX%c3+o5Ld#}(&;{|ft*CM!1d{FPLf}-Q$oLs&PWL+?D3YL424w1Clvmil%rqz;fiOSZ&RtJT@>jV0Pl5Y{7?2&b*yqLnEepMt29_Rb8>fH&D{*|q z2NL6Cz&QE86V8ALuyk5J^0V)M>(Bx35C0lWAot=&USR&G4D{~^{_sa<>Sts7j{9Gk zDDRIvwZ@<(^xq~OCo#a%ot0Bb<-Z@A1rS>nAb%J7ciQ{v2P|#rr)x+3TexZgNN71( z2oI?Kz56>FVCivw(bD=~Syl$!A7D^USrEuB%inOZB%HvhwB>#7?z7zFcr_o?Q zj*so@E&m<&$5#nwWqBNNX_A}C3XO!_WwB@AynfoKmwln%6%%L|gt!|psID9Gy6F;? 
z7u1YJibRj1vvJqW+UQW^Ei)vDMqk~y*U}!c46+RKZTGtbO7^1A2~qFV#|ie7{awuX z1EdxCfU00x`cj3NRKiE)32Y$4U0kV5)W&5czO7C%D_FlTA!Tm(#myWdSs>E} zPO@6a`Kj5#r#yic2QoM2_+AtoihH!V^s`XLQ*UC13iW#~5*@m?I-PEo1bPr1qqZzD zR%EDq2W6KgDtVej>H+>gwHjcaK+Tw%x4s6+BgATCUy1l-n3#>XAlunJ z@j;9JLV?u5&iJGk<<$^2et2&j^f|o==(m>KjSP0lBZ#H!#YsArg4B0io7SAOfbgN$ z1e@KtzlB0_Hryo0aF9;S)RSddz|%W=aVg$LL_KN#cyx|pw~CY%jaf}p))gNqwcMeA zMN8N?23A`_@*Wm>sXL9zm7lM%ScfHNz)ASZ+!XqrWvQbNF(@(`jhPMPpqF^&p9grD zK;qn})OwEsCX}}a`y-ywU$95u8}-XJ32YC6N<3@=gS^3>J^|^K(Ost$R~J#@+kw;1>N3S8(3K(+DAt59AL~S6**dG&6D{nF-*WHu!t@1%5iD zVRG{aGaXukMnrIXR=PgZo4d9+cZwB-g$W)afz?PC^3k(cr$8&iuG*16H%k@vHX{~t z(i$s4r521cv_6YIBMGucW7X(hF=k)pf+O-1Xm!Pm(u7+lqUPI`$9he(cNh z5{U~=c1gcC$gVpf*~vpVYXz0>+sd=xG4ZRj$(7yoq> zD@{vHy@y_kBfQuW+a(eU8G$ zc%2i^gD-ao)0{@MGRg~%L!-nI#lEDBlSPre(Z!~Y3JNo(mhIvXba`ISu&9oLq`@vh z5%24VQC6Q2%E?ZYyBG_;Stp0-{=ocIO0DL1a>TrX58$dn^cs!PsE`jdDR-2N4z0#w zJ+yCT8L;(KCg7Or%)XS*Dpfq0RP(-g2m~0LEYP%IsVGqTJmx9KSFrx_yEDNn2l z_oN>%>_iLOZ!WtcxbQg*`n-DFJE^B);x59Ca5XU`s@**D;Wx)mlu>=xGq*_;np$%r zeIGHa*iAdpkNvH<&^pD4S{e*U=h&`(@`@5^vf}5jwJwLz;1{lR`mGbKNIqR-n!o3> z!#{13`P9tg}Z?dUvc@BE5pp--Ueqp#Q@g3@&_!+@%>I zLVQZBDpXMk=4FZ-bZRg%(mc65X@pqhP8iRkQKjVxRG5g_{Rp}q9Xb0HZrHNXPUl{$ zqL&p~!n1&b={tpT+|P}HFX|x;lCWml`I2LMmwE6Ej%>{+;PchjA35D2NT?1~q|c7| z)P?pqlMIiU{+RVTC>ilun&D9~;!facaxY<9AaH0BthR~8ugw1GKK&n2Hvp8>RtLI< zQ{scA2U5QR*)hMRCoOVyE*vL+Xd{B;nG!u!PR>E6#wR0NHAO;fX{ zUvkcH)acU`s#+u?+qvd)l@xw}YWw5XX01QvB9c0teCd1ASNAlMOAcOZQ>>QtSrL#>|zxWQ)Vn(FbyH2=O zeC4XlBQxAzAV+_}I_M{93Xr+|qIGCjmYH>a!PYiodNGgts&~*83b5?YJm#^mA`HN; zODq=cZ+<+gJ+2|qop;IgUOmIWSKbbEr=-NJ*x&#MGik_ONf!W%a2BJly<|6K^%d@- zME4J?0RsZTMJr^VyOb9uTosD2h+C3T(XL|sfJt8fIn&3|OOj3n4;q$@DT;5w=*)g& zi9MiDb>d1XF@*!N8aIc4xZ_NY3`xR1dHgq^NLofBI1x369APmg_9J+lpZ-Kh9Il#@V&85w$XlVe>~oY-Q5#0X!{2}MyhJ%UXNDGd@0GF`kg{c4+_w{xnD zSF$jD5ULXV>E7u%j@F|c5RZ5$yd|oY75FiqC=h#}#8c6?g%MCFgrTwG*p1O7<(N1U zm+JsymqCo#*E$ZxBuVQ!ih%hRq_TxWR+5(b!Q0Q&p#!}#q!Y?VU%bDRzkK5xqMkXJ 
zNk9=hENJ}56GVZtYCc;dBRcr>#eJqEdJwXPaVc%XYA=N1QZz`(-^pbF-Sv5aKC9@@^tr6X3stTs*6q7= z#(3MEzy}p0O=(;Ybh-$B=zVf!3><|umo|tHO3R$vBR?Jx_pljwdL$@_l{;zJ=^kAx z$T_YdiF~oD9Z)-tVt2ZZ_$YNYK8DpExg4t@ej|WRpwoXfB4%Q8GV|e~4~}ZIJG#b1 zo$i)h#UcD_nYx{X?m$FZ&XfPDuTn)$fNoDrd!psQ^PgJf$)7_gJVkrH#s%u0#8;s`l1(RT-dm{mZdqzPg$KXqj zN%r$u&YckCB$YC0clP7Hv@jRUaWGSsvaemN+p2h8#cY_PBSo2_LZ303++y=uVnhxu zXd4r8|5_B+i)u}G#POUC(1OT4Az+qL*ai(BCBbD7x}QEfKr+5z^hyS?=de`TSFebA zO)_9o)u7B+-r$O%kM{>W>q7e`E*yI}+Y6|A!Z4Zqbbea!EK|7S`!H$nGw#TP95pvO=XWm!Pu&*vd?aL{ryyznyf=*BmqjGFZ)Xf30uYHb zp9D~;2SM1^VY^70eYd=^1ONuB1>yn@67mr9qJ6PM$BN}(fG9ItXw@A#AXT%MQaOHg zOlF`=m(h)x0Cnm>8Si?dKtXHQ=Yx-1u|Jt@K>$^U?zs>8$%CGlt|kaas?bwVY$ghi zWyzcvb$)3Jd2ezY4yT^cz<;~r)l$gb$<%?eX*g_{EwC1c%vzSOz`mg^(}Hk#3=UTA zF>@AEk?P0EiCHJ=t`D#)RUY?h9dZ<@srn1SIEnt#1k+DRw6FUKQ>|P*!8xLw7zr7~ zqtMYXb3hP#LG9{&L_)s?8f%D&F%lF72P0PYNksMdG|YQ2#j;7aI5GMJDbQoZ24I)G zX6jH7^a*$7igf*cG7~kx7(WH}q!uos*)$4R#hJnEdG9PCN>8ZpzC{wH1Seh_yW~c} zRTgC5%9Je<3YfWt*$>h_lgB#;O@>7+P#{OUY3OhWEACW2!%uSFbC20XvTUVJC`_8O+VLm)(C{RVxuOBH1Ca{fL*GF$BJrhvuhZnUYL6+H(=ag-THR$AhKMA)(KeO~ zf%H5jpcmdfzVN%De6cTZ+8@sD(W!X4j&#w43m6R+B_yH=m!&l58VXg_P$Aq<9hQM! 
zdE=`1;o%7Bq@wi%PE+C(7u|Wp3}n}Bkja}os&W3w#mf5B`Vy|CJw0>H71C|-XQtHY zYqsIdF4U&V4Ej217GvA4;-u`)t{H0wH#Dbi6|?=(7R|Z*BosKW6Q;e21d8+EFS3Z; z@p|OW+9h-@h(xZXRtEjOij`~lj5(KC@Wr-c+-Hsy!Af^q1ngA&Zt0q*)*x3l!-RNol0J8w8hhEkZ>)r9n`-k?!v9K9jAy=ilG^jq{%| z*4Sf@A!E)b?zryjy65vu;A6}xrW>gSsKxm(8!E32X%f4?UO?uWF2Z99bzWW8(%Bz- zoMMA6|Aajl4f!Nhn>h{S7beDEf*0;t<@1@_Q20oi=Y>doNi6lV|D5RemhkQrx?yt7 zmn2V;tL)~9lI3_BH3*x-{3^n(MK#{DE#vqqoa9M@@om-^4609NnA-Jxqtbmgf33&Z#Ej^VR(&a3!uhVM?Ddnsjx^FPi`1em)eM&NV%COhVW@e(QtcFoi77hE*bI50 zkJ?!g@M#m5H`6Hna7&!JBb_keL~((C*Ufk8j@fjc7_?AzLQlX&hi?O?Re+ucu>SW0 z0ePa9jSoHS)Y*x6NG_iCZgUV7mQHX3LMn`Cv0@f@Y08bl7|JK_>C+`*;7``y?al+L zlm*GL45FhB*jx%v)A8dgS=B>;rQ6nKkLM8^WRGQ!u)Cv2045}ZDDK|e1!{Au&qYt=SC;{05r0=5& zqm0SvR6xVc1pL#@)uQt;1(#Bv(0M|km7A1n~}6B5%!?UIIF{oFYw(|0#1j7Lj;e2z`xYKyN=caJxRv-Q|cFF&b& zA6Oh>#PB_z!;Dxy{|LdQCAE^)+BlPNR$nw3{k9N=Ni*16yHhyy?#)tPIsrs*1xFz- z_udzI?luM`08slb2sPV~S)2EZP_k=YOB8-MK2SAhv{HC!xh3{yvEb-f+3Ct*X zeGdzV@Rt#Fz3dk(qh^Oa-}`H*y!*E_ht$5$5Kg=uQtSek#u5u6)5H!qU2T8YSF@Tu zaNGQtUh#{_<>#vDLpa4De*W;py*0%GaqbB^d#e=yMHS@35q(1h*KuXlxn4`&^Ku@p{53?O`j6eB2el3fLw@rjhaY5tR1AUSXE5{|ZcXk%$ zJCzkaIAU>I7zpM$%_Sw{v9PB>8=@r7`aDBQIgZ9&wSb^sflcdGoDew|C!fnl1_+)I*2jWr=fnAFn}jYuxQgqHnF?r&BK z6}@rgu=bU)4;rgbyzH;0xL?7aN>`feWietxje^#H*wLjqN#^7tAQ9WF%1L;RFTHyAfSMrtx9(4Q!h9r{J^!lL$}pxM%g24?lby3fMZ z(8o3kSNT^mfYE#TlxvawmFZ;gfw*dpsOi^+`)c)a4vc#>az;fY5HKnd8&dUylO`u| z&<@+!JgzZ+Ikg;j-BTL1;QB_JxQbD)rdHZCl59e~OXNyL+p0ct#?v&gSFj*yGJ5@V z;S;Yx2aNBZZP&RS)xS6%)qmwxyRCan+?`&q5}%r9s^`U6gvc7$ALUN0cy`|l(D9z5 z_WrC|v6;Izn(vA>gnhJ#vgk3h-nWWh)MxbzCYcz=gwoQXpmh@`fM*i!r zc(#GsT~@#cQbZ1K*IQ0!RklB35BogAca+-3;IX%*s^8nP{JzVTu8Ibs&)SxeoCQuN zT_ZxQoNVxOo3;DcpmiA7s^lt=o}>coh~9pA=YMwO6OfW#p-)^(Q9B~q^>JJ+p3Y8{ z&;Z?N`L0eB6%F%5+qK@t>VBMbC}x9I?z}bwwU1$DTkYb_!f`)I<62sU^6{1CLx!$o zY0zRKDi=j>ud+@G*hfEdf2o#g(RCdglb&+TK8FEG*`wLq@xCqR{{=|CF!Y}Uv`nJB z8wjGeW}l>H~}D;m(`iaGzFH zX@Nk>;NevE2#_tI0Ww;0T0vBJcaQ-S?j6t{Mf1D4^d3@u5Ja}uh1v&Y=KMyZACv~p zc!}|NnWuR&Wd)%Rs6-PF(GXmb`kzUs{aE(0@wU*N%~^|ODarCxgTV7AgVKYY)yU^T 
z3iL+fiMSmk6Sv;Q=$*LK!5?0rZIeLQSTXLATD1iAqY`M`yX4pKmvQ3sFHi8$YRm1} z#TzISA^bNpUxKEBwLosKC^E(O9VpiNEE5nk6TL0|1JFqS|y$&!NkUZ0|+9$ z6lm;QZ?RRwH?38#Th)n$Jb!81tYs#(nGJOEek*Q?V(4|WZM43yI40hBZtyB}mu-^| zNK&N@6w1%-sVdf8g4u!Fw&5#6jjb^j{bv@sw)@Kh7a3t%bsLYZzUv11y}QwC)I;4Z z$>FbTl_q2_xWR=8#=Rb9)xV#daV%gzT;rDW1QW=>$z=?`2x-;zC&jS}%}RVT6i%MH z1D~l>+cHk>Y0d(g#imV-ZoJ+|xtaKR@92c}CB-w~ON7QMol6QRgQkdib>F|G`|L>I z;pOn|dep=W%mT%aM6?d+Y3se|S8M*xj)UgB!*F?y?NV)q>$62Ito2ddGRouy)H z?Hn@GR7T|G17rE}_Tf#y}m_Ic^`9mB;6&%m%G?oE~Ya)ZK}W75}O7i=0zG+Izk z-j6CJ^GF3#XzD6zm8;$A=^k|i?{0>zjOHF=rT%h+E&Ft;@j5PCf2&$ls#z?A3oNYN zqN`8F`0ox%;7e^5HQxMwK&=1S>HAbJ+xIDEo_4?=@~x%C;w7PRERFRS?;AB|z3Qdv zFQNwxFXKL=zdwN?Y0(puKaFJ_xro^FO6$6 z?mL71H_qSGCbeH37t{UTtb9B?7nbL!Sj+x@W+-?Vht5-F#f@Rrg2RoC6oKR4*KyA- zb|#C4mwOR*YSO+}jQa73j0Eo%*pChitwMBXvMW$#rxhRDl$TJZ?56;yaVgi=&SYcj zx@~0m;|xU>JkGQBXK`%lnTGi5B>8bYo+zRBI~e}nI2a;cfBd^UnAdm&W5RWHI;AV` z|FRC^)g7OcEsE+-S-|ZzRHGK_u`iUMcp9by@_n264jhB0vz}S-Yj?Bd1HPmF?gtwq z>uzki2VobOq>MkiH;dR`4Yx%9UV~q!^e>SCUt|irJC0A!z1>4ycs+vAX-rix^jh1h zpm)1)B$z7SzVB=`-TRgCX%FvuiRtlZy0?IM^^reS`SG(!iyv3x7L%F2GcMi*?1i2w zN+R{IeSK1%t$SfrUn((5w#el$fy>|&^0eT`dgH3xRIX6&m4?stC*%5FI63H@#cp}a zMt_rc#@5|tJ54Yj#R8j+<*Hg3I~m7PG>_zNezo@FVtF;aeAC+9>{R&F9%9q8(dQ`9 z)s^yL%3=5Vy;@hL#qT4n)q{&)rs-e%<~R%OPiKa-ZP>Ik4UH(-RiQI3ibnNySMtt^ zr@dAc>z|Va&y!*awj4*nE`QEk7R}PdYRzYf`FfEr2am*KRz9ba@Y3h zA4&8&MiCnHLZCWBNS>%Z`(3i$f%i6W1&^ZfW$P{;EmkJ%Iag37J@ak0WC*4)SmG_E zM7sh+lC(>s6~ZN%xJpCs5F{BW*`e_HuyMzD`*7PK#j{t>eYfJ|edFk{?x(U5!K<{> zvlAQ9T_4TxXSB0)(F8*D(L`CkKn>FA{s3H^K?$*C^Caq~!G| zl5;I&kR`?Le$C-N0fVhB6ry%`r1o**zw8+}PxBsM_TVwkc56p5R|x*RS?lG=RmmL7 zS{p5U@K7|Vt(EeLo?}LUeaAs$0@t;mPJ_pTVdnVOJVJSfiQ+&f|JEl>SeGz(3bzrL zi9)K6Yggsm@7#}O8f8MUUu1mO4@WaK>zUg>HsU6I-}XCP?aMTqx#V=;H(@>Zijsye z846#XY)O6^-E1YLjIX<3fpurvqc69mEB-0lTX z;x}J$-YhoSG~vlt`;ox+++n%i&Z<3avC3uNuS7aNn5X`At#80!Zs<3sKhu_^?naXV zTVam1b8`3RwmnS|c1CFP){VFvyxrB~J2f%w(GB`$Qy1Qd&B}G-((*Z#fbJ^(dGn@EK^(0~=^c(Os4?h9M zwm@i0ZzU@R!!CR&?N`#LH0~4S(mIt!Qjy2cK%OX3kV^7u=$8$QD!QK3);>M2oSK-( 
zFmEd~Zdl!<{n*%gD|*4pgy&a=>5lNLg~vLLPHa&wHKt%?a?zmg7O<9?X~BpZ43QIL zDdn{A_v^dgKfX(;kx&7rMr74O<5-XJ$VLirA4l60=8;~SF~#hhLhVKNslU9~bXr`8 zXuikEF)CBT3iUw?0e)gt)kfZ_ji^Gqe ztZ{O3o?Z7NyzPyp5IS4lN*t1kiygH~5p-`I<4ELn$Ih1X4CWgizrZZQetvZ+p5={W z((R{hZFf?XZh)S{2lXdxqw^+*bWXmcx%MQjTQ`vrS`;Ifc$2ab@XyeF&;l?JOhpHt zEC0$`8=L-=eo85MlQ|sOy_5m}SnSmDEMJqSHx_!b!AEvTT==7v5Py5e+V2<@0#0Wd z0v%hZ^ZZO&9pMN8fC4!NIc8Jk6E|9@E8RL}Y`4F_U^Kh5%%X04)^3OC(a1aT-3?#oVXg5b zWtOoC+pWNjvLEDJY4yk1W%|nm{+jPt^$Im|7;jcpzz#&;-bPfLw^&H|b2n)E-S~^P zMdvZPZ}|i|eWa9h5u$lyN3Db>he|EjAAn@{-hjXY@;8Fe1_Kc!NTX2WnojPMO+;W> z@M7iLjtHX9y=Ro0d^=A|Bjee1TaObqThnLLxE`2z*DgGP*SYMAzU6>E+N~&x9#MQZ z(chJwS5ECj8Cd%M*JqbZTM^8)=}xmffAPhV+a{`nr{97D4!n`+VY157jp?>xk_6I? zS+MsHuzzx{E3KjbvlVx{p@E1e_xOoK3z4Vo#*t=s%8{YvcDD={ceUqrd@Gmw>4E^v zv@cr!v3Ga=+qNBv{ne7jNQD_;=UM*Tp>;eoI60@AfWRqv)O51R!rnk5v|dU2Xze%H zN!b`;RN7xSB(r6qO#6md^Sj%fHE?n*tTlmu-`k%gHUhe_kCE&A0>f0yJZX0Ve^Zsi zfSOB~(a6>EK!%d>(bg1LDGz;GLbJNrVQmnOaIn-#G)L*Cp9+ML1bChO@;90ohm|58 zvK75f^F>jy(zoB;wn74(>ZugWL=!VYe!Jw*S6b>2PadLDc1JMtZe-eZIa`TVIG#jw zxO_JjKDvFYlj6=a$?J|!%55r9dpd(-m5F}Dt^pzK<`2^SP`6f$WOtNF>?Zkl?F zs*275BR1{W~#XUGQ>MEORM=_q#^?*>a2!YNTuM_Fg zTdFULt=Qp`Xvu{ir{;(t@M=_LJ`;nmk4F8s<~3h6@F>w~kwl-I$FC2irPj@=g< z7z$@wMmqf7GixsoD|I6Td9K-3`%-yHT$r&#WZHiX=zoTvH6Ynx{xZx{zMpj|Fl(B} zNpsHKeq*xWYm9j5N3JzX7-nO2!iD;^LAruokVQTgxI>@sSFJ3{1LKBi+^;B#v*M{i zY$~_MrB$0d?t4y$*cy#)|0Iv((;F+9#hd!tr}KHck$Mf9IXq#)*W_B|n?X44&RhXS z;$=>J=*w&KNSsSUOkdAD3>DHFM{ptPL8ZpQJ1%ey%B5a)kfOG8>jq)?-me6>MyVmk z$7&yz_i*`QUP*pH-{zR0Q@PyIyMovP#*~b#hIC;HrB973Yiq6F^ z`Ztk14~e|^*)A~21Ko$FP0^M|kiLAlh~m+xu%yxA`(`LKnXP1;E$`h}Vi|9|QjRVq z7(TL7i8;v-o9=yrUZ`6e#uz5-FIyJpmfwc(2)&wLD~Jq*gCDo;X+{Qn*x&5A24v45 zC00lwBc5CvWI;uel1&w}Vw0Cs0P@IzZN1Lv_!|vuwqVBa?L|W}AAZ4Xq20^i>Q~3d ze6y_%(W_!@lt{MggD(^)`4DGWXJBh{Q-KFSrUbvFr2_Uej4gTOF(i>m{+R-~;1sTs z`g~enDd9I-=NUw)Cl1%WS*_7$*nv*7C3)b(_g>@z#T-hL12p0~I&FLa@=dR9<3ufb_)1n`(C*iX~VAriPU^Uh~9)c$a#9oS| zv)*`{%@pSCmvMi&`4x)q4Htvc>ZxoF7CtgjPLI&GMqt-(AiwPOPb|&Q>bVFV;k1O> z+|3cMTFy8$ 
z^Aogxgn9{Opb~Yb`j)(0A>DR*H;c?Mg+JT;lqT6BNN& z8nF4A?HCL*RZ541qYRfA_Sk1t`$Y@;8ZLf*++|kJmKI;fE)H*2cFS-5$L!s@cv8|F zsH5fsBs~9&4c9jV{{s|77#Ti^fSzMiA`6xDFuz_1JUm-Gl>nlF)Rd>Qmj1WmBkTwJ zW9iZ#7$zlJMWk#VmZj==xt||+x2MtZS)0z>OJ>i7Cxtft~<$8+iRw7;j@VNRoVy8b<~~x&->_`WF&} zNCWweX+8#kC?y>UzSk5NFRKPa}07JBaesbnA1Mx{!x4-YjKjjc$EK_Kbbx`IB zFZ?}|sGn!kojrsuLxos9C^>SWAxLBYYu@*jx$Q4xDDvu7GX`J#m~N|uhVbf%2(!VD z(6!s^mjIY*-M#VQ9;R{uFaa#h*zWleDnR6e99Zh&8zJKO=M}3L*P5KFeEk{7$uelk;g??;ro{MQ<7>>276y@J)Jz#9h4$4-tB>P-bL%o;BvCp4`K5m()p(WTrXcQ zUc~=jUnHde8PP<6BAVgx3GKU+6h3r@vdC+9r=}lBmwT4ETrNeY(o(O>%%{~=uEtjf ziZ$KIMV$x=$cDvw1R^=#8*p6hPZqi4LI^P*3r?3H1VJQ#^xD6`f0ZR7lgut`G5U@S zl{e6SiSgRjrsk;qV-d9*7KzIoYVx+veT3zNJ^V>NtR+7?X~5q!AR zcGR)>f&JQZZNQtK(!DN0r7gdxp(&tGq&4tx^$tPs*i1g?u&(VnfePYEPqt;C+!v0q zK;cNol!QPT9>CQkCak`yB6*ipY#Pt;h}l!x);b&((Ux=MiB*8SRX3jTp!0Qp7-tLW z!qhl-f`I9RmF^TjY_^SsOI|_FqZU1Wr!BNr#nkAIMd`W20SQj02@fIF;$kBt%)lTI z+!!a26cYcY@vT2m+uw}N?R4%P2;3@uCWd+b{vor7KFQE(5pO{MC%EM}Tu=`jhgy9| zC^j^bTH^ML)`QdJ>;8jQphSf3R>bcubbZ27p%C~=Y+b+n+Q_Rw>qMArIn~ z&Ut^XBU0Uo6jHg_Xpj7>2GJD>a!(rto4?7;2aR4<43bEnx#lSbS=ZRYCyYUi7LJMy zEHnCfme>x&%&pkb#Gxk!S6^u~s+Xy)tCxw$g^Gzegs;&?tVJV^PL0J{w3bV}`>e?? 
z51$ehP6r0tulUB*HFwUs5+l$f?LG3rsl)C15S}FuuEzPUAAO?n-HpkFW8y8V|8neT z<}Z1&!=<}SUT>eX*JA#IgKJ08Ih_Wkjg!JvI+GX2Cb=oa8)2IKJ2wKTf~9AF99j7| z#4Oa9ZCg)zo72N8>0TrKbq0e_pHDdIam?5;!H%7IRK6en1l7>6c8q0aQhA)V2$F!l zCq3d7_P-*9Vg8qn++0!nUEf(MX`E-Ii;Wir?tu8WQ8$(o|Hgp9SZF0mKUGr!$c4A} z1N~2~rG+(ipv*{-O2j+w&(u}7950{2?Nrb#HCz~*Z~dWM*!=uuB}8qhSG%eGZQ-oz z@3TjbXKE-m)^F{oLRU-a!@Px3}prv00;KG znM;rhK?xYtH`6TQ;fo6o$|4!^806xj@IRkHWDWCms|lX{J7o_50D$gez0)vWx%Dgj z`0RF}(g+wtgRW!XA_Qo93~hR-B{r zhay3%lzqNIJrnSAL0u!915*Ag3 z98MPlV0CLmVp*o??R@A%2Jb4HKK55GJ1P=jSF3gyI^95?1B(P^V$Y31JISj&O{)Kf z1;So%PRptHMx+JPzuNdGrrG{umn!mN5Obq7>8M0sCE=j?t!fPZo6R><@gi1!1)k3x z9+6dqDZuM&_wh^sRxR}9s?q)dmF>pf!zoX7GIV1dMD&L_-(Masi8Ah;hAy_BK$Wr; z607pddf%xR`7!=frOwPfqo$pb81EBGXj>JA&2V9xkTZp}-3a-yj#EOuy~?fM;3`X% z#<>mZiocaOA+L_~ho^u=eh!3M=VILh0WHK{4@5M|nGDL#!?QA*48!TpwQ(Dsi}zN3 zQpb6csZh%6L*xl3$_VdUVLJHHmHS*g6lHgmrB3TaToTc~^PP*z!|Ry~+_ruym>c-7 zM;a;sGk>|~HRFzZ9{!q->!W{=Qu74XWU6n%u;8@6QN4>_luFbbyR-2~L6X&V zx@41>fLTw1v@>8eIbiVfti-8>CvJ?-l*D>df95xIss$1?4<3!b!?sNVkjs;Wf6+_6 z3ptyMbG_B=X}olm+hf5FtBF^jhav}0iCA?o9vW8@%l};ONjlb!#5ASfg2SLs1P16~ z&#oyifHK=XhhK|frR5L+_!;Yje_v932aiyy{{ON_{8#Efci`{mKSAZdR%dEuuX2!R zY2oDrk7g;IsG}Mj6>RR=L>L~xCgPqE=6Orz9z>Sdp4d*su_g{Q|0bO^x^ATytFRMb z*R6@zO_+_ICdKDyLAjFs8#SRHsAR+xP z`i#r|4e4Ht2Cg7_j`xw#mAkHuHnqX(tZP_SpxD{@*HpU(w#f7QUVx*rcW%0W`J2pe zvFzJ)d$AXcHJ?$a;SrS~?!q&I2nrx5Rg-4X?4Uobt-sGow29x}SS5)IU{B-+-2D;& zx=u3t`Fj*cOb|qcutv_AH7J%sn{mEKU|vD@*SfKR`3ZoY>m&nJ?Ykw*ZN4&jMo>2L z9+l4}ET{aFbGBOHIllR$VVA!pRvJ)ZZG7=D1>wmlMj@j;tv$l_Apo%>k>s=Htb{&5 zOcEtBSXcRLbZDpmAhiuq{cr(>W8$6lTa5_UE^XwL!r+gKpqq&+%3ohdUO)Ov8i0V2 zpn%?Fzovml{BFMxVn5xA&BHTh@@yq_N1Ph=LS@h^rZo%5jfHwjMsmK{_E@4|1disU^XDJh~LZrtwlEO z6Vb&0L&#P5BuRnLll~Q>8in-_?Ujp6s|TjzwZ)F=N`Bsq6YR@AQ~r4EnOgcOGClH4 z+2BV${Qj$f{Cjp8jRtp}YQpGEuVvH+=yOp(jY>78lwy-ZYX)o3jH&gc<6?%@(Er`8 z24ImcAZ&6#DW;raM;Kog&}+a?M^4^+63}|-~B@reyL&WnF!;dVy;PK96nIS9q=LKTOKs`J`J{4nRlJPRykl%d= zG3vI4OM< 
z;b?nC0kZB1hV09ti-V;YJT9`&@kL5hmH$?%)G7ekoy^$2*w^42_83W^45M)5Tcth$6$8;Zyai>Qj!UWN&-lH*)Uq&F)0qvg@}V zaa_35bHK~(R8a~o(8cF8q>L1kLO0Uy5+Fvj9$ORj5O;)4mG~{8|QL zzBuKOVS(<8^ntDly(b+FyD;dj?_B5R=0gZO(<`@Mj5D*vJYT#{;M(zA>PZz=+318$@Gv217X(lj z3uP5N>q$2YuWDua^Ny0ccLX~9+MrVT=3&H|B250BLEfaC%dz!TqtOoaP8XDqXJ2&d zb_A}7Q%*$u*m%KC8)!e;*O=+D?zT2kLNjB&j}psJ%PYHp*Gc~aY8cHg^|Z?$+YV87r99_ajQX7m+yb~Ttc6>fkieac zpP#Ri*DzMLi`$C1Np|TVx7_j^xXWScPRq)a!a~GDR7CbhGrKvHP~N>~EztlMoxqL2 zAm*5l!t66>{fY4Pb@h8o3_!7KPihnbF5QesOMN^|4|9BcT9wNuyUkY)zbsMHx|979 zzsL}7T`r&UPuFoPTBJO$k%O3A>_JwNw z3_7N78t6=v@X=l%;MN?!kD~6~ni_aM8VbX!nz!Ox^e{Lc&#}b4&hbz6K~7H%e~;@_ z#mfN4{2r2fPmTOhB<5IwpioxV@{JeWSR+tA^)K=CC#WE_0ai>QO(}vU2X0|}jeP9~pqwJ+B^R{}fTY<6`hs(&s@Ikc*b*R)-K75{Lo7SK&pec;GS(n|z-7qDP08-V^ z)wm{z0_3qGp?%X}4{GvpVGRSFJ!8aQ4k?pAQX;Fk<2m=8qq zXm`62q_daXTE(%CIOGHh+YcfHts``!*dmLOf^B0 z(iy54FX$se`GdU@obB_&hk*wzE?k>m4a(Hu7o*vX5Q zCUW|6ATu{l7d*muM@QqN<6(r!$4bMjxq_+=e)7O`M zKwWUowfemA!#puM>uutpu-7z!)xo&s4fffE`|7*&) zb+X|LJ_6l-|C!GS-{;{IZGZVnQwvnp(k=4&DYrKI{=O7=GtqNZRqiBNwR<^_d=~Xd z{IyAtW&XGq!bf)#ZlY-fA)+NrhQQd8Wnz?lc~1v)`zH>Tkq9ni&ToX6V2<4!jAJN^ zH;g{y=-q-`4+p3$0J1Bn!NT;oA9aB`S{bT@tTeWSIirrbwIKB-oCn{HV(@Xz$}{x(9#5SM~?1(+YT#~m-N9^vW(m>VPlD*5y6WF z?(Dg9hdOSu151r%IQ!=xfTh#Sudq%vyai`>L+z(9K;?*gWlXg?&9Vre-*P8+RF)-^ zaz@428P&r++kwD4It}CxA)0hE_350S_qdc``+=XErqkN1@E2v_+3#N>KPOIv(<^(; z+x94e6Duy;VVxAfYLhPz<6Uw2UcO5dUsZk0PS=CwBd^B<>*(1!3IDvOvh$q1VO8&J z%D4=tTSF_A>Iz`e`B8oRL1(`na#u&}XNB(V)5FqMFec|>)uUn><2#`jN9v#T{y-@- zX8Ae+8pSDwoEziCc;;JsQufJtF42dm zlefQhxU|OzSWTR&ZX-2i!h+rBHL4pwP|Yw!p`$Msg+_e&ifU+|OIP z@RI2zm?;4EHvVgaahN&f%MH?CQ3dp|H*$iv?&*`13Q_5z7!Tk}@hDr#dA3{K8Jsgq z`_w+0pcOX~zW6-1BpZ)(yBv>w3?xcP$^FdVDmPws_;S4}oh%Px13ju6KYGr4ED}HI z#xBJU+j&~13={J5RV8d<<=$*J9!@(UdYN!u`y;{*0!bXZGME)YXIQQL;yVB4v8Y4` zCj*auRkqYYBJqAjP#bY5sJrzwI5%{72fJptq?=Cm&i09+nc%)Y`5e5c79M+4ljPG0m;XH zSx?6utm`c`o;6nUlNpY4JGJ@@6?B@xsSVJE-PBy%%Vg@1MZ=(eT@nZ>@e?iSO8P$Qy*H`j%aW|Dp%vrIXtb?W)4M{uXrz_Ea|SRNb;>F_j~Dr(!JE(;{!x_K^7 z#2xM 
zp3qWo-@&Ob>UOcGSK@+}fa1uN^C=!&fi!p#V4{5{SE?@2S<-8Y~ z3s99_U$~KBM_&qlvtzY%!^SSS6#LS|Hz9H<4Km7$0))!56+LN_H)m1rIEY1&xrZdo z7aJt?Sek{CrY>7WO@(#zx`gduQYIq0!3VZf6!AW)BvlF@#udG)R_E#R==v|t}vX?%H*K;XtlO{(zST#&Dp?=;o&cjrScCo{P{SyPy!-uE)DlSa0(0y42uqzG&cM*ZqNob(a z5-%fxOBkN9|MeFF+jRyU;stvR69cAw)}>C7J<~$gBIS?GeeqzUt7dBWOTEQe!A>MI zp-4f_G@z@fL-_j@p09__DYT}vC%@}gg&&iykXSIJ^So7qYpAVZPabinY)QzDt?x56 zmqbU4cfx>EBhp8A>uQB$;JYR5Zhnnasx7`oWJ5zISl4?d*cA6xXf%rg5Yya}IJ#Xn zDdj{Nzw7;}v(dbPM6(wSx1s}zhoF)AHzxj+&*G5|B&yu!OEpYfz-|pQeC?PRfgge| zs!mjECJcyTFP}`~;L2I_*D?FK_v@f8K*`PxQ#F;?)s}9g<*?))Dx|sO@Gh`(k^y5C8FZQ6b&y z1ukf>Jmo|SYo#FFNS~$qm8i0%LWJn)pel1miu~8a<+mm&h?|>N2AG-}!S1pV(Oue5N5C__r=oLeX2R57t8oXWq3A5xTn8d}`jr zc8Mn6ZF86oUv(4x3ONVb+G04JHM2MtrJba&`CKZrn;Dd2n8!{rY$djos!Qe&0c!wr$|s(lJuE2`XdgN)oZu9;*`_G>iCR@%OL|p1DV3RTPSI_ zjhPw(Hn29k>VQ)5RJl7jDUS6Pp^;L+?=x1O;gW$K>Y7Ih&;|J1`>B}_(1sOiDo@b5 z&b?afx$4!O#D!b*nswPujA+7uBMMUTuh#PFvbQY_2Prf6j>17 zbM9hl8{B*(*5g1bMir9))twgN_;jwMy5t+7`Mh3vuu&ImN$I1GKmdA*+w^kf*Vr^@ zZ_q`K`BFS)sot&5woV-U(0G6FxWsS>d#OFG<3kIbd!s}vs_W`S&o2hp7wn>@Y3H0D zz0;`Bz1-1Ah2_G?A%8AoHlV2!&iXRzl^yp@GtAp-O*YR|16=G+U1_G-`)Z_Y>WpYp{H38EOem<$iNOyP1aC9gQ^px;OY zoT$PcQ8`MMea~}efa&+@1SHwi2Y*R-BX~!|iLmK`9ssRzQ!Jt6dc*v=Pb+2VKp*H_ z#(BZ^AN}`_8~ns8riFQf+A)E06$9jy)ZCW8r~#?~5zKEWf&4+Q5StWoZ6d`#?!_zWveUAEp0gh7E@TqGrEt93qg2`5^4 zd@~cz*B3x|>^&6-bkvfG%cNaBGg5}LNVuAkEg<)%(Y-O)IBWQ-xt)`tpo#G&era%F zkTChpxSVy)gJ-Ol$LFNQ z_*=J#o+G{-yMyr)dtRSd^jAyluw8_;x0b~Up3RG zadgg2HmF|}?&-`iAC*1<#*qPi()y+~8z4_|-}?!d^cTj{4H*4(mizZR;qJW#kDrio z({%`cR7kgTWq#pMR$a7TRjnS2x@jBv*s#TV-i;dF%>{UQ)hR!7w{=>111}o^w`IlnTS(?Q*p=}uI z@1L_0=F-IlBj(~;LMCAOPd1h*s$KprHNE$rRGJrFHQV)kd<4ZtA*k5I(DO;+zBPxe&cmyg7!fPsA*axsW zJPNEC379`%OpZJB{(Y_H0pzzklt5GALuzex43c#V<4T)=wNp_zfN$x zddD8WoLk7+Ez@AlZn``#%37CRise&_(9O5}(ffwN#~w}(6N(59KG-_nf49`}i||L& zaNek`&4_C2jLfO1^RE%W`J?lH>N?LD8nxFKP5i@A?M&Z3)+?Ooocj@bIiH^(dWbKS z;*Y)(5PsLf85CsD(0?V@CO|3z^vV}qEWk@c%UhZLY72j! 
z&;Zbv7p-i}GqF&q7YV#F|C;$;BJf<(_c6Db0bB=0`H%h`%tQem3y?1MoB^4G(f~{O zUp93`kGl_9XQZmfcHx$OV+mSTtbN9l$NaC6i6H}LZU0qhPT~U7!llJX>U*I25J!q+cMzQ3-75u&Q^A)I_jDH7AT!=BiMiSTWb?m>G z;R%4g%)%;vmB@kF5Qi53wU)TXz_e6|Y*|`9LABsHrvDl-^#w4PERGeuXbs3ZFSGE! zv$_h<;zRn$@YK2=Pi~igo#|g|8S?t>rIB^Si-1gnARGOkN!Zk^B3HU;z%<{jW;y9A(yI?z zIwfjJqiU2&(;1J=pPfZTq=9WYe0)arbG;Mzun2x7-{-QG7U*BszW2AKA!(yy!s4zu*I+p9{<)6eyqz-LlI_^uA7%B5c3 zRK?UzoSdf57{{iHfFYOQ@p68(NghiVRiPIR6G~EttlRh;HT)|mA_MlsC5Eu}+la&q zxS=guAqMeQw;{rOanWQM&!Y7)yl!=eXRZGjpZs!@DuQALE0nsg4;Z9u-tG+{4F&pdB%9GG_ch4T`8GV>D7a{Kn}`k z|K6*o0Go{@<_&zn^;NaDzQp<9;+TXkoZr=w>bF+`Irfbcf%L7AVR7rePrn0CPk$I=l zy4njJymfrPIfiRfcf;j%evGr^wxzapX-;In+)Xx#Kgv{;tghHRXZZ3i!CLBYJ^~aH z!|zMS$*x^0)ZV>g`<~Cdh@_%Y4O1*PbIova(m$#e{Z)1TXgfX4@QdoM zyk!J@Q20S0#3Wfs@mJ2f)2$UW`G?Jc^6jy^#t~$Lq`+re*n)7&*<~YEy&I%w?6-)) zUbpxaGaG&upX9R~YA^8QL|4=PR&n>i#sF~M3sT?GqDsla6wL_C9C!2nI=+(~;G{M{ z`#W5nveB$HR3{g;8$3JwaXlkGxl(cWflYRgYNy5%?Mw|g|0(o@>gu@&2@j>acT!+P zb!X79Ff2HlobM#wdZKia#Abl@oO8A7M=OFvS{N71OG&Q&ULyYE2iSK)G4e!x;^t_P z(JV(Jox|tLn3bsCC88FS=1<9ny$R2|bIQcmv{Ie`=(-J{&!CshyR}eLt9a(PA<+k>y6Kz%Zs)t#w`qhRDY>9II6$K3kK5}_)o3f)zH;f zdza_zEZvS+2g~RTbCZKB!%>Y_1r@VG8S!nv#+AyHeGJ#EQRSZj2-MY7on+Pn ztj?K3LQ!i&L0~N_>3ERmOMuFhMzvMHP-<_5O(3n>^B84&TZ|T1gw8~@RR(D5kyeue z=1Bo;eM_?-Bg(KO;sL4-B|9C9D}3&qnJ6Ls>j{qf@WcO(+MCbhUa^TV0D-5gpUXO@ GgeCwLj9fqf literal 33144 zcmdSAcTiMa^DZi)1j(Z0D3Xz!qml;+5|toP7?C*SJSszmA?G9+l#B!=$q+_KL(Un7 zoO9-EUf=KBU)8Ns_f(y8?ybuoQ^Ve~cdyl}SNH1Y=?;0Nu0-&F>cO2mcL+er@>+N9 zU~=ENgTao21zd4ucN4#J=i?oa{0kiqQq5u*0F+i53FPIqnx}QE}9$db}o-f1y{nG#^d!?-o&^bdS|YjjHRRx_a;+r z`OS8VRDqtrh-GmY|KI*FfA}sj#<;g1lliUI0Eg6%dVdn;9vf1DI-ZaXd?}Lw0_H7) zfvAirOE*=o12r4Cnyoshb2KaM?@(O?k%>zWR_m`}7=-ZlQRXc$(Yxo;h8lXJd|3xh zs*bkH42_ginQ>)uVgdimZuPw^GC=>c_FXBRfo4wRb#3lA=L)_KnTpp&HVG0v3UX~s z*B;Q|K=y54!{p&%e^ui9#KlOD7dcm)dvZ#+OflCT{>&OT)}Hb5!*EOjhc&-*adUT4 zTvxdZ$p0IWE}y$rZ%}1gV3RmE{RAfHhw-KR+sC=1&PMsFolqndvzi55KadqIOl>$| zG#sQw#`n>EN` 
zE$aO`;Se=Paqg=%z5G|>-y`}U<4@SB_l-Fd4;MT6WIGx(C5v+-IoEDfI`H-6#!xXf2Yxy|?1d?7}>GbTm3W^>KShY648gHL^AN^-BVa}8rEs}61eIkeq( zj*yTue)dGXEzJQ+W?wzARe7 z5tYe`KE~LngW36dG84c{a_8&H8C#!$&b*Z`G-9^;9{gj^{aT^iy)wO-uguFBrcvei z&(^%-1!*WF-QPEdTZiAynKjpdrl zvdlftl|UOuOWd*Ox^SX_H}w8h_ZMS~gJ6^-NzCkJB2NgAZ9yHL*n7~J8DU&na<|BX zSyw|X+R_tq=BsfAUmcJ;3;O!SSkhmUvdha@bh6RP@WkEg#W2%nT;yB}ABg0#?(GD( zalQE6-%`;e$}LLkW7PrPSD~`i0Vz2%TVm2!WT5=40$n9GSS-Puxn=M!D08DBjfQe( z6~oMrFFsA(QRuok-d)O--qQZPDwU>E_fj9lZx^Z{&k?Ks+|f0Row~W_XS}>tPMbpZ zYF^E;iz2LO{>7)b|0s$NWI9@BT1=O4xXsex6({E9l<4`*sW`#$ZUTYyPI|_8S4~TW z7aR}khX(y?kT;id*Wj3nv6ZeP!zM(wQWiU}4E}P2Dp5anhPb+!1uet{jOLW~H;ayR zkWuvEGI-w4qp7S`vSHD@+B1orJ?PGLwiDMQLbjcc*2foCe_c;un5&T_EB0D3*bJK! z^)24T3Pb%5iK1B8!j*gZbD4Ljxi<-m2(B>j4+dW&<_LuiAFJu`#+uis4?(gW$`&2% zs6da1lBMJ%I3CPK^RiesVR0XXKDdZSG1ju6EksGWztR|-g^y)Ddz$9?xeqd5PjZ9I zs;pA`&XY&wnESK|V@-^`VVMllECS zL^bK5=gwX-hok{WX$FEzzF(#X!o>fODP0=l=7ko(lA2`nc61thcZU^rMAgl}*XpP< zAmRKxxz$x1R?>3!#RNwDcY`4(qrzrf(9Vgc5?2?*tOqv}QrMw0@X@i}L>JyqR<4c~ zK!og-cTrcplTq=SrCy{>lwn1;Ukw$>jPAH7ij69F!l~w+Gniv%dI*z99{wG82{7>I zbqTY+VXru45X~yg1X4+HU_IpvHRi30cppb^W&&*g=^EacveH~nPh^0F^j1->&y`xt zI&Dpd?!HrZ5VSEWZkDM^6(sX5cV34V^Aq1T%5wRmT`2CP_*3PRENxiZ*>_~@eoA^D zd!+{wO$72}<>sLwp}!7Ds@Q)g#@!&%&Xcy%KpVS%_d?`zgDuOQ?w>yTpbJASw{k}^ zR-J)Ux-=0S@)hs*JOel;t>b2D;bm1OtZ4VGR}QcLykk<-<){J zeJw4bOcd}Uynn;2gULm*;nJ<;w*)sux3n$C5EH1(Or|u39L{^zkc2j~LO81_((!~l z-H7b*%sb5QRm5B9dm!@?-Z;`Xr3rLFFs{kcB)p}c*z(+4xm(b_EH}@BqYuZcGV{y2 zj))hu>P(WVnhhBH=~ba3{WmgBUa>Xy<7LSwK5g_&8s4={ij(_=W8qbu`!aT& zoWaW@OtKd<>^vD{W@s)O+W?{A=3hx?O$6LB8%lRy6AS7aeoP#0ZzY(jGMvR0fSxl*)<89hc?K#vR&I{W_TsZ;FePn;mlW1Q~=7WZ*_}`VU7k&&Rp#JK`!Em*_Zu8spYB}`% z*sI&mwnweH8m>A&d3;k<@76(Y)-gd;ud+=h;Tdu+rH@`7>6m=YG1roM9J^lBq@&cH z>&3w`tnI6*)VQpD5&VD!l+Jx#y|pSPzoJs1qj8icetVm*C}aJltB&f|0^rGEa@Qdn z>FUTr)0m;P|DU}WQx{C*{l(BC&+_Z{;}4v5S4g`JW6!00wwv5y`UaQgi%HZ>@IYge z#|gW8ADu)lbTQDTWT8}S=iW(~!%XBpflhy!of3*syteTAaPjqYNYEB~#K=;t_NZf{ zP9&pFsMF$Tt6Ll&=Fgrw3$=%2^Ve;Md@Zj;31@hSx 
zxJ-%$U6I|xBfV9~=38qg0!b~c4ZK(x-BCbR3c8^!rx$BY$nw1Hs{V*?FYZj!=ZSp1 z9vQSt%$)DAtF_MeJ3c1W^y4|lZys*G%$;&SwM+B8Y+2j!7l)pga^^S}pVHF)JW;p4 z*iF5e@gF^*S#(FbU!7y~$gdnJT$QjMj%e4`Z;6=5Jli?l5^xx=u!`Z2b^!W)GKX$KAyBoJv^;PO|7|&TP?0OG`%+;(iBlXy0yXcY`9lX z!7<5W+bW5#795Kpa)xV~16cYz)4HcA0#wDE8W;Q&c zmCHr(FCN-o3B7hBnsR(`E}|~w6(@225u!QsiQ`|ZCnF((;5QEI8BIvJP3CPEUb#EB z1`)XKNO`}G6M00m$=Q2t9|wOSWChpPR+^ye8o&+OM&~Dg_%3nw+t(X5uSTh-?*}?D zk%3mR?o_X2uDVS*`L9Y)X_{>O!Bg^HrYvyo;;L|T>0XbN@oVpqTCpuUpv|`sB%~G} z>Jhr&+U|6QhxK+1RKDbn5$6%6RpawXs=q9@@RnZuJq5iw(`?FKcc4x8UdXcVjAqiZ zNSWOtqb^OZfjqNF6VINVlMh-OIw@pVs#_~apOEs}VD9(SPqD51Gv!L2_9)N0@${Fg zeLx<$^UXznZ zPe14P%M{}(36@)A+G+2NUS zYQEV-XP~B#lVWe4PhalmtYo`<%WOESyO<{O?%IlX=k_h{7w@_#<TTw;Qi2sPoSRF^!fDfaS1KdNjjhC z7GX?{CjV&~i5?9f)JPTQxfUSG^;}7j5{j}j=DDs}BpMe`k+I!L9~@6;berK9dbqX6 zHSBEE_b`krit1drk>sd0qnNi6a{56mT?V@H$7QzzHMD2Av_lYqupp!{se+onU&NVv zb0KK;wr1*mNd`*tr6ZY)&wR$}rQ-0cm-u#+)YKEa-pPaHi#u?~X9YI{0`T63p!s|K zGT>Ctj-T+bVYS4xlo8+eMpEuO!$tmamcc^V*CYLwV7gfG2K>Vwfl=Ev+t?n`Q1FH) zD{aDcrr)o*Ih~w*)te^Q(Ki*HYrjbqrMG@Hn~9&$rp-8hj26Vwvzt*I9`!m03I-E-)lF<&F3xZ8oLtFN%OR=we=(I(Au%W4mP?m*{ zi)@Sc#&*8u0S1v#UR#Gfh&;V2{=WGzI@YXuD}UqXDBU+Y`I8e-YOxko30JWVRe@aJ zq!woA^y5VR21amMA&2X=tn#@D9p^Lh^AZou@w{#o3x>XbUe-6h#O!Xes3^CxQtw+ zy!cg7Ea9vK?HT&!CHK#d57_3K^E3S;j-m9T&c%uTikw?l(U3XVVennF|R9wk?4)kVkejTKvrV7@)DgNY|84|nMqww0LY~cD}l`dLwCW@Ow zjRCuN_fBLHja%+Q*-b-(v3cIY$4XMjfE5D5FE^@_3sYteV;52+Jg_^1qpPn|9e9P~ zuX2nM+?39=Ru;SYN8Zx-;s@XSJgu88__b1qdRoNH-RxA|gSM2(YOTOpWnS*y<) za30s%^@MYyFBW|M2L;n<{h9|mzP5LsDQMbPk);v-o-1sd{gF;Q7h--cio zE2;ibKI2PY0gWjcN{r)7KJl$zn1N3%Ir$BpR&>F6zW14idZYN0V45?dvnj6u>k&qj zcYMijK9DdC{6#&ZCyDnXKm89ls)IT)0?_;juSAMa)o0CiNa{@Rxwg_Fh81(GV7Q8v zo1JkRK3SF1wZO_lvK1L;sKr&f6l!BWl}BY~>#~E*eYI}acFwP3Gu_o_RcUHN#%ph8 zRU+(bzgIR9?`c4*Nf%_V<`u=)pM!eT`75tP=C3j&#~9|Rc+=ixK##o}c{?age__ly z;Lwu$f$_H|oEyp`*izH;3iA{6k{cB4hNrcC zuW=wpnUH&8qh*;#{vQ>vN~lQC$}AYS-Unhaw?oh_mqbra+qTkbX1c`QFkV@nge?Kb zCVDP8&AS;&5|ja-3SN`Sg5D4D`+^%XTy%+ 
z*fN9xL2SY!eR#l&VDL>E>SSEg8D`ebXY)wP8#TOEn2^a`pvKqs25Q;Nr1K2fE>Sa2 zMWSe_ys@EHTz2&6Yw+Ud#lS4PK?n;X+D{RRJ6Uu>J%`chkFbCD@pQ`&8Ne$PE^r~G zAjr4=JIwF%Uf274spSC87li?8U@`_8>(;)dd9VnqoMWMG*D-fyv+)~=TDO-u%g1rY zRJsS)a2x)Mr9leHT1A$V*ZYvN+F6p&mAlktbEms$zeI5(1?qcG4w*ZobpS)?2aj~~ zErjufd{U-RnduUjGX0j7GvfRd-N^~N`IA@Ta_w1Ror)|pkAiuucA zG@t!ttB7$ntQlPGB@>CLp1B98 zx85b!!A?(O?`XDQeccSli|CC=Ne*{(2`|S~RFq{3isoDo)qJJ+!M3g~RbxhP4AO$R zaudhgfEDUGph)Tc^%<}4uqgbT~xg06Z_vZ$XECmzxAlg((^~1j6B9nVSzcOQ@eH2S3}D@ z*+4`sd;-VIae>(dT2KAP6>0gs=i25F$A2coDEiS%>g;U|74BoXa9a|j!vR@;TOm{1 zEMhjSDnUqVt39Q`(f2;$w1Z^X54{npsovSah2c5&gR7WRI@Y)i@*?gT&8oEy>9>r_ z=Pw$gSLqwCl~Z`|i!8{@h94(ZU;Q4>53Gs;hubRFx00A{)bVZ*k7&Ptv+oUV6LEMg5pJBJ zc6&l9fICVCJMCr((@R3#&l`GD{#X(R+XS5GWS~XMD58SILD6qQa68vZTTb4kwNclc1 z7ba-5vbFOu9wI|2z>3lhW@`#;0C}dcKj;ds4LDhG zaoS{k8bzyYO_K2+^PK$|2?0bzk)AktR{IQv_-cumH zuc#a5dg6AO#Ywt+aI6%oO<0r-m&)QZquLwy0XM!y?GKH5Jxb)^`|#f}Z(VpJ zhbC|x*@d~HM-Lyl8r6}bBeKZ>TVaK@v|aV&8+ryk6^wGU9gkw}4$65Uy;NEXUMUk$>J+EIXY!sE5U-)kC=+_5oMvm<50v0ruG7VkcK_(FFIDewD7&qWC3W1c?1wWxM5Ly zm%2eDgF*&to76!3CC9C53}4lxPt&nEub$=Q1JPCA4wK;#0e%U zJ%~Td=-xeXa00`8=q6=h0{k4&*KSQPc=~MYg$R$LPiv1t9eDhc&^b9{<&jmahwy&G z*XsWJmQ`1^nn$&8;uCrL8F`p&o&7J)@WSWaiV!=R*O|+-dRPxwYZgD3M7cKee9b;; zn0!pOq3J)_ku5KmofD7bS8~qW`Iq4Zv{kVJg>=gsqo_1hyv>)BBkIV_BcXybV6&BK12H)-rbP#pnOuwcf zb8^u{I=-wyy64V!69hfY36E@qA{1z_l(fc6;RzcUNg4UX{D@2 z$e1a%dMX@n+)p?W%@O1L!Zo`5sMRn{^LDA=qkHQX;UFXj) z`}9?*ntlbRQfKiCKe`hw5-vfgyOS^eO1`<~A3P#^!K@?Y=>!p`7>Z#R!EkJ&Q}?5f z&XZ~MyA1Nciquiwc9Q>L=ljNVb){~y<)f=7>%fYby(1g1z{BoBiSy#M-0f-zK!=O+ zWm|zu8XOIH$85zHyyb#$7BhAO8s0Xn7s^h>#&l)Js zEY)NaFg{aIRE#Rl+Z%N&kMDk09M1IGUyxQDFBI|SdO4N3^~46PSD z{*=@0kWtQHwX#)TLBVjqn0|HZ^{v(^qjDNVt<4@iPryn1TAh@9-bwYW2En`Np8Qxi zjN(m8bg59Ew)|BQQyTHe81dRK92us;>KYVUpJtmlpuc;L>E>Zf+Z?#Wo!b-e+Gby3 zTR2jUb$m+-yt2&0S0}5@jYlAilA-#d`$6*o)$3E7U6A<+{OvvJww5O=PNkD=3ja@_67<9*q)bB<2U ziu4H;9o`}c9|x5U19c%L`54LRgS;!v?hLaWd;Sim05+FLk)#p z*-=z!f!;_oIgxyQ{o5*7YbzUgcw2PLV!Un?T7-tD?5t^AsLzJw^T%c~=mpC<{n-mJ 
z4;G6f{6KcS3JCUJnXm1w{NB{79IB1{$|kS+a9vH4h0~ATsTG*sRP6lts5k7b`|n?5 zSr%8G+u#uKb)n1&rCWvkfh2$bkDXeDQmaO5{s>`L%%#x%z7|m!f7C{hc>0zab6R!p z`8yZha3Xu1l42$vd~8Lpi~5Z0U|!*^Vg-{SxR)$!cs zD{$$wgztIOQOvmFaHsf_ZftBs>9_YaV>201OhEFNqOT!2xNlWG$v#3$p4@{w-0j8j zM3#tk&kp3abF+1ZeE_GeH;STwHWhg9xrr9+#^e&^s`yL=pLbG&!;Sb6$S z+Cq~q8{9=6>3-V?A^!QK;*j)Zu|R8iH86=`*QIu4NC|MdW8QWkwFpAE4U;zi2-!3I zP0F<91Jx^3G8roMGVjrdCQmrlEpAz$wG?jT`QS=hp}w9*N*FSb%A6*tIKTB$omy95 zBV?Qmv5AQIF!9Wa^A?}iqP=C8k^uThf%Ygx zV~dF+>8b0_)zO-q72U&Qb~BB~7< z7z&yVZEFmC$DtXuJC$X1zLVA&9y~ZaEr8htr{2M1)Ss(0TdNPX9}lj}+Fj_nMR3JC zW`B9}>KV9&xTGpK^G23jzj-t=+YY;Wd^O>ANR#bs$b7R6FQz_EtLmX?|M5^xHN7fo zYT2P?LSJK(u+=-xR-8!6_@2$6CoAdEyTQ~Fw-FK|DPiBbopTRPwHj3`PPNqQX@7T( zSv$*r?vkGH`#E3VFUa!XYx%e?FE3-Xll;r6#~z2+cvz3zEDeo_xHeq=e_gF={;m< zcSs`UWA^s!g!wB#TVun9~Ak@-?z#0XfcA;p4(K9fg5=Y z=?^ONZoy^WXx*8YbFyaTdox79Uh-IVV)9t;LIqaNM}iS`<`Smfggj+Wp-jCiZCdr~ zxGS()MsTB0%3I`S*91ktLY`5B=FF{Db(L1zT8~xAP+_~9e8wEVzkrgC=OW1S&$bL_ z4YPnFUv%$8N<+Kb2DHgG~$!&a{Mpr`4G#>{6jafPI7bz=xyYb6QmW6S1;+fxPT$*Gu9_d zTiOio+K8Nukyb@K+dKWV)VMYh7M*>+S%Ae=d6gbO6uZ(K{SZ>8lo3y%bv)zxOpo?qKTj2tug9-P)!(U8+PNoaQ& zPXC!zgX1}XmkH)XZGk7IN%8q%n9R9*3Dz^;r+{1Pr8hnL$beb{3kH}q{#U-bUz5f4 zKx}+&8GV064Jro^L@D36$ffFQ#^0q2LPdbN4!LLiuzFeh3eZMBMHa^TMjS@7JwS-) z?WN{KzCi|n=bj~|j~pc1RZ@sU9>U%5=b-B29qyUDlv2Q~S+@x@00-o?Yg zp8&tkwI%99R8f4HNv<&7+`sV?PhEuoN73w*#KLr3Qt!&uh$1@w8X+itKG)*^p-J9m z=WUG&D$n@rLJrVTm_TFK&E926|5iJJFF+*12Mn1zY2Egz26t_;&Qk(XE6R;eX8Ih$ z)Bf?+=(1r^XQ(L~+?>|nCqO9^IhIA;&H>LTpzM=H-fp-z{mI!$MkA+hsS9vorJk#u zc6saGsd=FQ$1_9bC`A-|d5Z+HDvCK=Y_lK(PL6mbes$*oxwA>Yj&8O(-$2FVpMM}` z=?~mUP$lWLGi*STvU!M-K7WMCekn~f3r;L`GP(OM@^6c=heNfN<1^EZj#4uYtKsRilRgS zRBb~d#~zk%O5;Qmv>S^{q?g{sh41=EddYyjQ}-Xn#4djoDT;1`2_V{zfb~2_Fc6L6 z^Hn*{fP(UFSfL`K+?OmXDZUAZCu!(9Q?`I!8NTMrj|?=*jz1dLZL%mbCbd1aF_=yQRCSp4~5;UZ^9M?QRU| zE8{|<`we+O*t(ZsEFFsQR$lZQVDSOAUhiCT#Sf6VyCAn7c%xm=JwZY^WC!m`pe`9c zs8rvmGnLTPxEAnl7v;Kao}?iM-W0IwNAvC)^$Wu{B2nN#|DV-iT!`43tngGe@}1d} z%JdB8)mm2Kbu>ay7^V@}C(YaAPra(cI7l#*9vlnWZq!CW5cWyeL=hk8Y(+UP>5=W+ 
zpAAX7SB~qM7yyB7Qla3yDR9_JmE5TF zA4CDdc*lO9KbF}A(~@tayPxht*B>noG?!PV(vaJ7Sv~N11R!PQE0^F5&+ZAnS3Q7F zZ?qoM8^gWKBo-&zy|8dH;(svu0Fu)H(DB~}Je0>o1_wN5_I^FL8tS8OtvktjnT11Z zAsBPxZ1H1vh{+31Qm=aYItkkJV`Y;a4~{4BV1BQwSCU2O;VCo+j#n|BOyuOXzUNAxYrcwI#gS>u;klkZxIaVu>#8=ilBP|LYwr7m3` z_Ml2-%VUnI^Dh%fy9Tfc%?qg1u*>xOOoDm;qUCmg18$No}3B=Umi?^duCGu@%`h;oPObtdxR_;9~q8|fr%Nk!8LwXJV%n{*U{|bv%O#*A`sk8vtmsJtORs7YV2oL+ zZbLZ!(Ls8rjs=(9{M9y}q|w4xl98)ZRw2X^rHRZAqW!NS+-EXoF~XVTBHkBQJv!gk zLmX)@Y6G7vlP?O;pqHf04?L&c&M5nZQu6eMN&cztX&&x!c-`fYQtfq_DFhWZDbx#e zsj1AsVXT3EOKe6~c_h8KdksZop{K_y-(i1BDbWMYDeT7%1ntkCz-csbuW2#@0D)OU z$5cI9-vD}~PxlEyLong|b?c4iGG9H?UVn*sN~F8%AVEm>au&t?U8SUy)ylKAIsG>g zjm5}Qfj42=3)ptaW<&=M5h(F*j@nszMSr#3onL-D0VUkP;qyS6vI%$}*QXS9?lb_H zqD_2K#tD%g4^3iU^1R}wz3VGImQ%mj;Nj|V-DC#L5LcUFG+LM1piqE zJBJ%9+0T&GQyJf&6;>OTE3qNNBzkZ_3+MF?xYi&`Z&)(}@rv)9m}$w=|IYLy4y(t3 zp%9Hq#?7op9D3K?0;MCrQf1q3-2`74qFXtsL(Y6Cj!A0;7L7-PlV42PGgs$o)c4b- zIEnjX>kr0oEbzxv5XHrG;o@h;rk&Es+n`zy>L@XO$T1ZNGjL7xI1uCl#bY z)RJO$h^M|7=BdoOgs3(Artm{e2E`;^t6Q06UOZmt+Pac&Mo2_#`QRmZvticf(mW0oM%m!j zqoQkDQ1kF{v98x*h~Xe*fTK=a(7mwr*dd@G;R{+`=|P}G~}hN$~Y z@2zb6|JF9ze`|YUy`Jc`EU@+N!i`Pw@VA2;JCI$BkmgML1qS$akj;ALqy)EM)m&Du z_2@}#;6R8D4A1#R=SNS`A)m+<&w%`TxYx^^_HHwy%$r|Tca=t$@!IF8{B?yJ$3K9j zDoAPTo%f+wsva6IPq}e9XP9+3y}syw0Mok0{F@}N7moUTpx10MU?+nFOu8=AmD}a@ zjH=An!3Gy%D{-o;2({}KF1hO|$CFd^LjPy8@;96c4NQ2yeZwng5*H9J<{CLl!ZmoF z)Wj2-c#RVN4={uOt9tam=Z4k6Xp67OH&w4)>&aZaD@OA4PUGWvXkN*-`Ca^Z*xp5;Hm0*R+rO z;+T{QGakGLGLvr^>+e^W&U+sB17*6m*HVfOQ@<5x(O!xDTS)wWq-6O2^Os_V>Z?#0 z4(KsIdp)qEtCWh2V~!|#|9&_+1htxbVxULUWnP&pf)dnKVv%l*nX!o|6n}z6Reipo zV+~JIUk@8(-kS}uDxq;;9$HBX#d=Nq2fP*2!Q6O~D;BeVHh6*0Y*m^I*-3kIx>2)! 
z%?UI_xI79=IP?Mat`afRM#n|uSydVic=s@OSA_ILgD%|z&s{;YYF}@x@I|4Qn?SEq zamt}F9<~;~(W%||HTYry0`fDFWL0*+`u6l{BWx6qzjP$SEEV^?H$;qOt?^iO`3)J~ zQ#5}QS?(1ddnVK->R5$&tC68ZQ&{cmhI@1FWjYee`nEV+xwGTXY|L0Wj;pT5_A)QK z7`HL&1?fXE?rgCkXQKx7J=Uj~NXXuKaW4BMu3D}p61|q2=3c;HXD-3rEhv*3hmx7f zb=J7viLt^P0LuDGERs zH55B85Ok*m1X04^UB|$LE}c{ARSU$qU$UZ`Z8CCw9L_+FwmxFs+SY9l{f68d&b2aT z0-}K`nndPUCyy=u7Y3Tk>Rw%s#JL?`GB=Lfaos8wUzFpz#WP6^i{hB(;wd||9wT+U z3rNfFS2n3i$~O)r9+<3Aa@+x=*I+n^329j!Q>xy1cA{Kq(Fo>Q%gs#-YA|GJmz|JB zTDm&pmx4Nx*Xf7GwHb?uleE;hE+!Yyt}faBnl$N#va*_ISEax7v#Pq*ldwrzI?zFynnXC68a#?Ei^f^ z=N54Su$I_8(2kV_}w+IW`$369s3u)=2RT|XjFZkwb zu0ZU}EV>9Z{%5b-WFQaZkeuW>@BKyqb4mcjY9!*40q7#Q6djT~%1dPl$LP=+xjUB{ zGlOlF`x(H?D<_GdFhwQUo5?0HzQjTrZ3Nv0mWkt%mZ zjN7a*XjlMbR6wo=*KbUY12;2tLByn#@gf?iKOerjiEo3ZCLI={>r~T+cEYk2wZ8Gh z*k3H%kGIe}RA|BjQmnzxf*X#veQI3KPmqocQd2T-CM~*-nn9ABEe=;jPkY0RmA{IJ zvaQ#ZJ7$jS?QdDaf%Ct9mia~EVWx@~+rXRVyVPJBB~i&=9R$}B9|&N#9^-8*Dt6yL z8XCqgyyp9R;g@ZCT*O^`yaD^2Kl;M9%1oOJ6SJxdaC?q>Ei@=w<$%vT#RWQVLz&)|5_ZyHc8cBlv* z+;;-AQGc-CHdCE;Gvg3m$b=6nhj(SPtW3rjV-sQ`3!L9t6$$gKg{e$n$NQKMFuNF? z50J|tnK#2CSCj3+yrkoz)wc`YdBy zN2(Z5IPnRx%e3+N2!G2B0ARG>AbreC>86-K#>GLL+FeIFYf&NnEX#~1VF#41Ea4qM zmmD#?UG=fGyY7Ib^w7O$dZ2?NIa_EQEoTS0BDy6Wbz4Z-+01bR6)ciQ1;sE3KcvtG zW%96_2y}C@sL3LkH4fYRJ_4B+urD0-;MLT>-1oj z?5G07Oi;ISTJ6y-i@~VdBq_?tIREjZiMQ4ekYBg0R2gcKLyW9Sd=%ZEHrgDs@mS)D zJV}yR>#+gHe$+RF&&S=iz`aHB0Jc&QO-W8f^j4OgR~3&~7VU1Q^h z;971(^-+@%1$p{u4)lb`z7!AWdxgZMeBIv!^2Fla44VM~ zbrXmtWgOrj7&dqD#5a#DYw=;|{^Km5`OS(!RDWJV(`S{ViaWtk3dc8d+~CI3pMidC zTp8FLH0f~%#QB!sxKL5xz_vIxYb@d?&hTP~WK4I=n#!u3bcuQAkQs=-pJ8l)Wf^A_YNVgcjf4;$>IN6^2QVMAa&poL#$pN4mTk72YMA;3Sa-PT9L{A#CinqBW_acakZ@2kdE}!`-*u^91!sg-eeISZlZVV#I zE45>t3jdse9J+nTs%a8EW7eapv^m!R7%Ci8MslJ!KwYmslgPOSYu!CT48{v8I{hFl zKjPxM&V}lO@My`KJu#{TeOTI%L1miYwqMW;9qNGZSu*t4Dylz$j>pxD`Og=F9KieV zeEWl-d^Yn?8&BWp@jC;nVMf@iVf4BK&BcJD3EK&Ktt0I$McOWy*>G<6VNKPZ?Sc@ZBAqo}eo}b`LEdxBC)cv5S-SORPe#3!_#_sj! 
zt4R|rv>xp2^!0;uw@tV|%=^1;=X+~hW9`v6t?|5VQdzOPu3B`O zNA*|`H5sP9;6;9pwYRDMnDRITVUVj?;_Zw$1;G0RQ__-6$R;e_UAey&7eX0X_W3)h zS5>}ZI1z0ND3{w0Xf-ALx&$c1D*sLnYA1JD=?`S$@`O*o->l=v&j^%5Tg1t?ru7aY z-MJAq1Ho|1*k9v%x8hEGtRPz=c}RQXon;~wqjRFrOvd73eMrX+(yWYsx`_Gx+c)WM1sYH##JI-xZG}|DIgi?YGKvHTI@9}3! z=JR4ON}s6^HsYDnf!7*PPEDTx?0}CZg5j47-s?RQf?2#CdZkMwvD^wYi3YxWE=E|) zmQ0@J$L&B)4}UOV)3YwLUUx@;)UV^|qZ8K2g0oZ&HLqIP6GVTDCZ1d)Fuz~;d}r@7 zqk!7WSG_!(YgrWU$E!sQA7GN0(aBV`bERnDr3>9o>!~Hvog7aeE#p&_-_a)WA6h&X zPU~_v4~ZkyXDS|(bdY@JtCZ8~)xd9b<>r0bp|>O_{jr)udOA|n+mUKS*~{JBoHgqS zaR#5ToX==dOg>XA2IhNP1?Sha?5*8cd?#`KM_Br^^Nc*M&yCk%LPCsOUUY^eIg@nj zQw2N}x!NYwtc^$IZV$_dh+|o*SRpd<;fIl+$_vdg@|7ZSt-vFCE?6doFW;qFkZ-gj zS0>%D0$$cT3(gPUHTd6GIBRiF8i2MA&In*o=Y-ft*V{I}&Cl@ZY^@;3PxZu2l3F|OV@uS0x?Uc?N6`-5x?I)To`9(4oF5}t9e}=ef^u%8yhKj`Zx7$~M)=p(F1>-UNGkWpF-*d6Kit_Z8=^Q|#lU3-( z_}#zS=w;?VeOg5FzbqyN*_YL7@N>Fa$D}ooU^HP47%SGJ=oVH`M6mQb30);4q4({GQujNq5;aw(m}>`ne}WLF174nHXKGdrHu4Hbe2Q&ui5s?)cGk%HMflWo1f9@;@4;e0PJ`#| zqGlJ#2gVE;_p{VQ8z}406*Xq7*)riW$`_DO`b=kI?}*B{iYD15`|LArzs{5)MOPpo z`re!Sn&(6_4M+ld>7lnCBWC>-|3&J@##&dK(k5HoiHBi5JL=clGnzA!|Eg6E^jEFV zo^nL~6$&HH9t_5BvhP>iC|Piugjk$Fp}wddsnO!K?vNgf_?5aZ>9v!6CFPshn&;2S z|23`y`O|b)Ey5Z1i>aTe|EMu8_kW;9EWUbvkKHdWwD&VLr0nTME^GyULT}Ed*>D z;9X~6nuUBSDa$zNK81fRRFVYYzLkf3otN&qk71v6e;K~N90C|^aQ;I|6`&3)bxiRC zqo(Iyc~4=)>5A`mUHyBd(Z5{Gq^h3zSkbn>i&a~u`*DHzDYVmD`lD-2p5|GY3huW3FM48s$(rVzJbCqz324*Z%bERY`P!gFh1$L%U1Fn> zdOFDD>Y2ER#?Iv?(RS3Lx_eg{|42zyKiT$(l5zOHOU|w7{@-=n=X21zW|aal5OJ%v z^b)ikdsTZq!I_8Dr=i=Xsp%oJDOdx}C>BKYD^{tl9c;c|!v5Vmp4oI|KUc@Tejn5r zLp}Ss3zSm)_Z4M-xp*SeT~af0zIJ1Bb(s1+{aE7C!~MK{G=IWwR&IK=DBq3Rw#m-L zKEqKjK_8;)-t#u{2a~#F7eS^u%-aQ_q}gOo>S{w>6d;d|p5UBl`)|#CWmuG3)V76y zh@dDbt)ev2tstEeB8?&-&Crbkf=EkBcS%SOHAu)PASewpW9O~;Bl8Yzv($1K1=ez3-O4^KyQk$NVk(^r z)>X@0n(mVasnR>ON)mT^g1bMJ&@|1 zNf`Ui=YFK2n#&*>R*sPb?Df#0_0+-9es=xroR9ZWPNMPF8#r&=RkE4SB;O-PbeRTl zu))$6{08=9F}df+&u8)RwyF5hE0u+g4O&m*vIY5;e1&v0L`eK0hm zOOdMbb)JZ}j&U+!uQuPI4~&r`+XS)@CIctLh%R?Q_HR@lU1S2^k_mMdUV-=kgCW(i 
zx>emvgNyE*qTdbtJZPO^WBnsgoYwZ)+kA^XTnxXCq`62fxwiy~N%XGGSC<^7krZGr z?-cP`*r%&nG8x6O)U5@vJo%9eL=Gl9;(FZ2EXpf;C`z0L^_tkqzK9SuFa`eDZraQX z;3d#Rq|GN2EyW}8NkoadWM?H)e?~Y>>|Nm7?y(&~@j; zJl*3Rpb`tTkHDM<$M-aZsJzF^ZF5o$kCp~khF!d`dXc%Kg2ijnTLS8s4E6+s2A2Oe zCGnlPNk^U|7sGmDYDw|KiFC9;@a8fX`b#o07b>am-Sl;ZEY%eGOPq)7F3GvH-WASW z42#FvTkcK?|0~7=E7b=Y_Bv|mRi#g~sa!7h$bMu()POeF-T6%GvpAdNG3Hnl{26v^ z*2)NAZGhhRv>h^oPHMa9p!uV4f%zq#=+4=@{uH+*9ac1}YpL5%ix2kxO-7nDU1tl52=qf2Xid9bqf2qhkYO8_Z;zu+$niW0jRW+PLZrQdHzSF25#%7Xmu z-;D8}F9JFkz|^-<#!X*ZRX2s@M7P>8y&~GdD_r)Uy4PHM4xx@9F&O>$0owBZ*v}~Q zN$oxq2U$@9kR{uS8rUPuK?eHx%l|ZC0p?VD|KQkrFHA^qK~gk~)t}(D!uc*oKHKV`+RD)H$x2R*@kf;5{;+9v@}7Fo z`Rm_u|FQ3lY@}tv@y!eq4pWmSn}_)g#A-%uJaTm(1tODNHmz+AK34@nIu@HPQH~01 z)vpu1ifek6C{tVyqHe12W3kNt?LFU8=_QGm%8b9`!2d!ffAYFNL5s|I;2ML&m}pzm z*L_TcRD5W0xsyO1-bYi+&Yi_!mebU2y#$xD-1cwb&uTSNfS&yPuHy^UkAt z6E_!?&5kuRv5FphS=_J(;%`wbzc)Y1{M~gJE4BVg9)I)=6YOC`GKI1$E2Aon zvtrxpC7MuI8C+ue?~(QcYCbVvGl^BH=BS#7 zE!79Yknd2^^PzJU!NpEg)-_(hhz3TC)T@VjI;0LtQsxko!JLxT_5BxCq6vz_dgc?$ zyTr_PA3yN0di?Ug*E^f_3P^8kXP6e=Zdr&hO?6P*6$_s?OB!aYoxZ&?ntH?~3K_eh zTO)`};rHr>YV;~5rYH!7b>w|juX#bL+s;kbwmAMb{q&hoAl zh*eJxf^&(}dmLB_YL5||NN)H`yRh68=UTKEXZ=tQO-VMJ%hfoDWLr+=6)oKkYNz&^ z`@7t>M>nm@2gUcC9Dxa-s{YGivsov>ih>r7LSoHXsJ~}}!3D-^aNu=?Zc;NzPwk$@9ZGSLS17u`9FJ$IC*I2BWefzCZCyYp> z_bl8a#Z8pC_s$ikN8jdyjY?**&eu^+7_m21)#B3P>C4^V=~~Cwu3Qi3%nZtBzN=w9 z7QK&wY^!&#?WOv_MAz2?zg9Nby#$GE39QOix7AWfK`jBNnAwtPjEj}TUQX1<^yj^$ z7Hg-+Ifu7#PD(bMn%ZYQRAn@#T6Fq?=*oG1tL@AApxxQga}DE# z7{wY>LHLcvK=1+~^*|09VK&4+mO~AAYd*hE{H!slO}Vv;vRM!3pFLvskm-^$Y~TNzN0@oYVORby|Mhwx;5p*7|+-9H4n|DYfWF8eHx!-mwXRQ!V}kzJxx!`(Ro z{z+UzDH92S&ghQr-HVTbHZQ33&N=|q1Sj&M;PGxql(x%ts1zRk!k^jaw+t%X50c*Z zeHsGAA{5INcUJunS|JAVO#e4nz-q4-@*U-?01Fu=uvX~!;QHlqmPdKy>CgE^x08e? 
zlRDILpP_tKM!S0tIqlo;&!5`$3m4@jq(&nr!0!fb)sIa_5wTrn2X{&9W|a$gOWW0r zZYdAy?5LJ=b2QK=-;(g>9N7SL;BjrENwh5(fQRlqGhl!RQyqz?D6&MhJ0u9+ji^1w_L($^fXE zUr+kihfe|2HJ8n;8L!*j3~&xveyG>saRywppyGe{vtddx{UDN+~NLO$|?#v zy1o*?q8!rmRY@y(iKY%S^$SmuLI-jf+RIRvSwYkCw8jYhbg#Q1Qii+HuI_fW89`KXQp%2D+}tsZKAytu#@C_ONxy3dmh+?d)pgfN6UUXfK5z=MC*S=W*1A+~ z#V?Nbj1KE<61sF4-qd~m>?6(n;Em?ju0q^Lep%eT&O>Wv549JAa2K04e;2S#>0h}J^{#4l|!b;1AD#4KZPY}CBbHD=l9Ah zJsV-sh#rbxrJNL-f!#zK{mQdB)W1qPbf#LS@!pg!qbAVYRF@Iw)71l%9a1}mc6y{b zMk|iiA^jNy-7L^-zOOZv(}Tdy-NCol6Y~40`udofV-am{(bTA<*65*K^q2)!H~Jyr ze35iXN$o1zpGm}@Mv=)kAN@-Z^uHP z^Nm?RYjykNLxcY$&QZ+?A^ZH`=w7Ae!=1qDf*zmsuPx@_q0d@zW?E&}{_tEK_ZOP* zp-sLOw?%=+lk1lmrO``U*OadQQF}YBzu{x1?DBpLCO!W&R5Q-GgQHwFdgT}Y{W;!d z&@g5dQP6{hfAf1HVIn(8=p!qsy#eHSE!B$0h|+eG){IEwc5oeH4PVpzWp!@(w%|Sl zId&kTWUQ+V2}V`S%$@x{iWFA1in-UKwrMiu-MrPhE(o^KgYJeaGcmhNzbG>i-;^0t zwL$H5p#Yo4Lo|AOuVf5h{(yVlH_TC)xd_|`060znfOY}a(!9roP5FHwx6NMLo+`wL z=e>&j4On>#9!z^E;ziU}zris*0DHp1-_O%qOHkavDC)8t%IAXE60Shj*;%-LqO(U`U+Br7$|2Nc(4>jqvc3$Dq zD5+mgM{}n??hJE<&ZHSD%WX97ZHU)~R>8%bRQEgak)h;*Jq=qehBY0wUSB@*bXbW( zUDs%BA0UqsK<10Kbj(2VRqbk67%1Q|?Kn$B@ftE0S^aT*r}YeGnkuyxNTXt2cdxm* zGMz&vd+jK#d?50U&H37e4(0bo!%|Sqw;*eWW<3?i@jbcS9CzISn~Olo`%fC$45Hw5 zFG%^4CfL^5yTRue<^lcCcyN`grJ1Uu#fhsWmpl?ek`Y z9Ie`B7(Ok{E*R7C={?<@<2H`P@>p_c&>fqudk+vp|AWD)#g<8YkN|E*tz;3i#%Aak z-a&=XKGm^dtQay%3PUZ>_-x|NcM@izw>*9spui>AO08D7Fy zwNk*mzl(7H!lQsDWrC^}?PyHncDe9MXK-&Nho|@S3{lVh%?2XDHtVuY<*zPP@*xYb z3g=hu6~|)z6TpvBc1V|oDHAJ|02Q<_*BYc3aQ=-*vNnQy(!i6S+is8A+p}PFOO!{+ z1Ast1#AT0Ka%GQH^61JvT*Nmm97k;~J3SeZ1@!)~e?zsv5p#QVJ~DgKNNaRWwszoA z;3!6+rBg!YIWd>1?nA^xuiex5H~51himV{(^N+9IbZ$zp383h8qTn+WmJ%H~ma5C5 zuP~FIgPKf7t8aOFFU6p8eaN~(UG(brb;zm)#Q<2^LwI#o_vzO;;X2K}#Q!!{5SeO< z^ViDzwPea?8ByLBtH>5fhqTWJnk*CYE3`B>BMQA9IvgnV*wLptqquFe#(B9Uq2xQ_ zQ?2Fjmyzl@ULjw-4r3Mwkj(>TKsCDjW}h3mkn}d$_-E-0o~Fwr7YGg}^VGXSA?3Q7 z=D2`%`t)DmCgGEduUq*bYb5%`8Oj{8Y!$my_0T$r}W{`YbulzmY?aG ze$Z5B`ao;u!bOTM-_V;o0UQn}ciN&HW%w(u{a!A*emlBW3cvyNx^AIPy%^|_@$yel 
zdMV6*{0Y5+O8r(T%rs8)ZO=Z#e#`?y;pF=;#CyifxQGe+W#NAjYkrC-VUJfQ?gH0`^K;8BL`jEJ?xbd&0PdkYaCr^#2AT+5 z-T$BEj(;l*{Yiegr2vR&ZneoEur=3|*G&?{OeAZx&rFn|^!!gP?EfHWmZfM`H6V|#Cj^@EG}#(>$^QeekCIu8GTD;E55>B{^OWvf#Pxu{v!h0nBG zS-|m}W63ov9M`}#mnk%ka18yvT$7v6DtE7SF*S?hde&8>z8CCTb&HnpJ^gQ0o-!VQ zQA)w6GxfKGK4Qh#`|gZWCh56JKAO>EMZaLHKU)PLktxp4bxl5R!q?x1EQ9UGuSNL<4gALzG;0>{sf1YOtZ-t?`yUua*byg_yey$L; zHSjqoAJ1AYRxK^dBlxbd+v&yXx;gP5gm7zpeyt6iY2Y}VX0KT@z*d23VYCMZDncB2 z^~;fQ1?!`BuVRi?W24$gIiHsuZp zM2!8G`7;j+&fOZZhk|U@KLzF-?eYk+_h+zMF&@j9&7hivt-w)j$^Vv}&ry?AEa)uO&1f04P`jc(iH3FPP+2X+>N!hQk559i<^jocNf7mt zq+1^5ijoXpmWx8Du`M)L+-CtQOV_WG4csUVGoSnNNc4j29C)d=n|{79L>!mtOdzI< zseSSpG}TE?$k+kUNOI=|xU*O^3U&ShJ0&o5w|M=G_u4#S1t>4kPRf;pTb99#!cRZp7NT<-^ z^l5k}H*lZ`@MGcQUW3mhUDMmi{7!zjME1Q(!kQKlOOOuJ6`!MR=V<=F==N*iaAYWp1ci}#X1^&}}wQ;^-k$LzTy0!R3?3*w*~?b%&RW&37~$(1%j z1-pVHtrd|#)$%I9J*{;tf^gsu5{qVXQY@nPpw+hbpheSQji1F-p-r$CT#;LOQOpDN z9K8Xx9bL>xwXtw%3)+)$+i3kR$7i!zi`%5~WI=PvyhK*?=mp!&YD~HH2`J-^-4$})uifxK;U{A#7kjUSFW^2V% z0jm^t3RQ*bLYZU}!?xltzUE}`j zUpfuca$kM*Jfz8gbii^;m{VbOc)&8Y5aBzN_k5!PI5tMdZIe|lNlvf<(inQN^od{0 zi81%;413+oa~{+M1V+isdPsliD?V*@IVZLN;sKC}3?OzeTLa|=Y9*isA9Z2GOt^YM zULofQ%4uLT|KgZ17yNL2;3z3&M-6qAHivxo&l2kQu7NY)Hva1e4OwqrpEc8qmz$Kz zd_>6+1~(4}uF&O8Z5tTG7O{mAs{m0e8%7u9-7FZFeIc|PIxwue%riEMo^Y5qOvn1TJaAb_vyB?-9)(4Os5J*>UTMu7=CF z=Y!g@8EkJz*vvmwlv76>^H55%e5VX0MJk{q20iZ2@URtxg#z~o*E+MH+>=2G>z&^#5TqBEc^v1aSK0}}gTng%**0;euw^Q!{ckC{M7Ls0 zhh>+_Eym9DaK9t2SA%$w@KosaH9d(o%WbF)8=FtYrpvHw-H z`dgMBB%BguWgV%8yg|WSqg65+RS*-0IbG`K6|vTJ}3nx23W!eXs4iMR-@0Jd$dnlaP7CACf0+! 
zZd9##4o$KUdzCl;sNLpGpA>9s)KeP2vqXnNw+3r?f}XBM&^IcM$UG5H__fUkcpXkncb=pgm+!t9G7k6EZw;WNivQ%+l{B2}HSV!8R&M8vs@dE3 zX+%OBxXo5NMUmQG^8Hi-hNI^Gw=Ny3&Y7b9|M6-F)2wL&&x4Vs}Ve!=L)P-Bz){oWZifO1)F)i zy2;f4!I7l!P*pR1K7d%_T-dqoo6N!A-yrhhaQcL-5@#p4u}e(-0rBnBriC%B$k>Q0 zK)cuo?v>U18l+8Uzu!nPiEEffzcBVx`z{`RmY3>DS}Bu@ zBG;5+d@T}lzocG0k&ka0pHswv;VWTy0`cu{KrwjHvjELz6-3MvnR%?ffm!)#s@5Bq zccV_zy};FVnN7%P#d*svlBKdf^#I|yrJltmMNRJ?;e6-!xtUIsU$<_8=G>A?xwe3P z=IP&`$$L7L?Gshu%E8lwOL4Y@S{PuMqxXLwCdIauobKI{ixRR|(oc!KYG-hjO<>09 z=Tk`+7d)ZHy*m85%GZF=Kf)p%0`U$r*EyG)*QsOo+(rOA!k{nYa)C$_S|{p@bVcn z-pkzAK0_p@o}D%_dkf8kd&lCe7;uK`i={uYnpEDi@QgA%n%6npvfy z?AAV6)otG~B1LNO(R*_eQD9&(<)zhnaxJoD>s@XLIpty&;=;Sd^OQz)PKy?~qI?f3 z^PMKWWw9pAG4DC8HO#Ir1Ve<6b&uWNvZ~u8r5d`j$7a@;EU2xYc;oLTzTNopFnEO6 z{D~#+rKTsBJzX^P`yB2ITPG?l?F2hkt5=Ub>{n14w|w_ObdObDuX()&O8ZoXt?Ffi zaphrwQW49RD!#3p$=V3vfz~_zqMqut10v{QSIS8K7^G~Yq_5Jvz)|l?!QI29Naqjj zrzGr;oo`3#l7{xH4EuiBopPCxD=Y+fpYXA+G)orC!d^6x{Ib+j~u|GQ)aIo`w2i^FajG39aRbMxBC29R`b**BjzWVN&>tv8Q)` zcT7W>Fa4P+VW(mAbyg;raZ9_xe(%x@eWT~a&I4%tHQqp?(S~7LB9Fc7wd+L4aG^c& zz4ysSmPpUTiiKQ8JdC6(Ghu&~n%Bl!*7govP^6RjaTGKqzxBgb*lp)@#=5?Va_wA@ zicbA%VpEsk^2hCj#w5?D<+T`th7vB2+s2M?pP9495AVQdVKIb=mugeIZiRTc=Uet- zJ@B|?Ru**sq+!wGrkQQElatdLb5LQ+xz6srt*WCGQKkO7ih31x;>RJbm&Yi^d+n9h z!zyU8?l9pXRbH_Y2{!5-x8#11!ttyuCJ8{JtQ9+-ltC*n+Awt_eQwXeC}(pvD);c zcB{3GR@cuxfp07@x9j>E^rE@@mw7z{ue3iERHuROiG5mhM~pn#Bm|NU)o(MsiML3_ zzoq8R3%2ntul#y&*H&7G3A{2Zt92HId{M$c82RF41H-5_TwpAeN7U^g&HNms;6*uO z!wmwGl~BjGtV!Xl)^IOv$p;m81WOt&>G3z__gPlot7UK%=jaR`j6iD;?2&toOL&bq zn#-M)*}df!>Q8AsU{!F#KY}@@xY#D_>(OLp-mCr^b*3Znrn0`l!Oy(cN08aOyY6VM zd!C#-1lp0@V)1D1WdfO9lq}IbcNR5$wXp|51g5c(d7>odF=Pml4NUFrl_%}+EO=e^* zWrTWoKI0iLcC$xHlk=C42;RaNDSUNq>ql9@t1gD0zaiSF-0pD$WisVU=kQjR>E1{O zm`(jsX9n#R^LWtRDw(rUehfu%^THdix~rdU5EZ0gU_fOv``L16b|BV(;8qNH!oc_~Y%THv z*UFAtJa8A#xzOMC#5sxUP1D+uInukgd!TJN0(dC~;b{9T%h1HT-poA=a)~sNDIsWO zHSy2jBEdteyE>e1ceCPrF^{IoRI_9lyvWT%K&jz(&fP6Wsv%A_HcBNJ!Q;KsE}xCU 
zdj|=skXoln-SqjisHSI<(DvfQ4m9lo-N!c3Rk{=VV_+llU9BvyD4}b1+bfUm)LN*` zqHq^Dw`3SYL(y_sZ=-}Fu7mCphm^uE&pRX9d?&{IJht9|#*YHB4!hF4eGm|oD?*Iw z;BlebHTG?T^g+9ByRa$3%!Ys%64iD~FyvB619?3W37IP)29h+$&K={iY}qSRLN5pW zj$p=RcfwDz`&*Q4s^{K-jXGeChzGCiP_+CumP5*NqSdc1{?S^#L^m$7GT7nIn3+d{ zJGpEgkU0i9>ShJyn9ov`7voiR$&RI(>8;NBDRl0S(ayI`HxS*(B_A9#Zo?@U;-CbpiP5xf$vb3BNC~|rU~&DE0?^`CerljXy-DSOUt{GA$t!hDtubPLJr$uZA+O_xN;t%N=(MXTgR5JCMa z&fW`F<&5he)H7rGuhsI%JE>j)mBzFysC)_hX=15i$QnIWib=HzH)`+bP>(zh=^{b z9iw$S#3_q2MQqpEp;Y`NDLN7M9g}C)MG1hsav&wtE$=uImX? zG*>z6BK=a43Jl`sa0XAn%ZTg-`@^QYX{5ee+gClqxujE<@$Pa4vE2`s*{w78RzB54 zYTDNe{s=CzNz<6R1Z+UpQYRU2+o!g@_MG!1!4%Tfn5Ju|JPUbZ7~ss#HK20Tx&vKf zGA_nF2V^Y${#Df+E&e%Q8#q6{@7IAF>@)QNsD8>4`n1?h)$0a(#_M8O6HufD8u)41 zc~PbhP85ByTpf#_XgFgac?7(2-Glq6{wxP#^L!@E@U=Nw6d6En0b77{-gKoIH9`!Rh5g~J2o`p{*ZU#-zH*(c9G37`FIB=$TySDM(Y8fN1R1wz3#HNx z?ow!vd`L&$5&}j(>AkGTKrB7Mv}VR$OTj<;*{?P<>NY%l^Yx%O(Zb0~_;3$Z^C{n$ z7ptAb@4*LN(DQ;-FDA!#v(9sG&Cwnlw1ZKvtc__>6{lmsPc}z%rENFf_F1mxV|3!d zMjvLHrz@czge>gl29jkb07p7}`pFV`V@8%WYfw6&y&=HbX=9<1_zf6tqE}WYp&HQ7 z;Y!Sj%|n#Gp@qx-^1gC!z|ZPJ30C$=Y0n_K<8!p1c2YK4C-`!oF~GCugq@eiiHeV* zNr*O9xPnJQcE&v3P3FXOQ!KsJm1d-^3B0pJoc1K~d*i|rP16z6yj^ZaIh5tyC(Kw) z@3}Yg;CRVOzavD_wk5l)9_$)*;~If+<4v{z_i(>IpigOn&+2(blt5=bieNKlZ9~Uo!cp^L) z?Wh)hg4)07ni%akzA+l+${7{5EQ2Sz#^GynCTVEh?}WAAw9pb#MifHyx|E{LI&Q7$ z+WYf+dWfp7iT8XD2<^sN{Z3FHz|G)qF6)t;9elWXXLY+CG=8Otu1a;1=9xSRnOy+L z_^x1EaA{IR-=iGrliVvUm2IL-A{=KRRM!WsY~g}Y_-&&;dVZx^K3E2&4u@-ZT*jgNvq6x$q%Y|SE=oe_IYVU zNQP5!5s{xxt{|QVHEW)F7Xk|0J#aaRmvmrp;!cD|1n8|S7)h98{`Nuk^m3(jXT2LC zrjv;#MZO*=EG_ZY+i>O-d?DgZ)t|@vm&#b-3wF_c;@^X@3j*mB8o;6}6!n03>H!|=6*TApJC#vu#G z!N1e?So}DJeSYm~;bfIUT$ghq8^4BA%+ozOlF;d`5#ih9hL5qjJe3#@1&b9Do@O-8fSI5^>2H%^V{QXkC0 zDw01L8AOM;%8Y*1Yd$}5YWyCKHDp-GIdfCXw%)Ye-!kE#t)RwGQ8TqSp%n-J+@aS( zB-iJKH((%YiBy|0>5ms$b?aAGy)79q{4TisB%c;{br0?JX~E1phH~&dGX-9j)PpAD z1qYodaA9MYg>!I#({dYC;Uk~jvRf37r*7LC)U8iNE%_2StIr%u0DmphT=T-Ysw<;j z8!tY$gC2U%XS=t-BA!sqC5AH%rvzi(MbUr@O3%OL*;}beuC0ppobvUXA#Z6OB`YIK 
z609#hwzshT5Y1L?#}T}6sUqq^sn;G_?VpCoSd6|XxvZwJ6+-l}qgMQCWyx}>tjpe} zO!!+NytkaVE7ivJYU?B1)x%OAtXAFPh`F|1-L{mXs%LJW#+kPU9l|OJ*uc-c5H&Ca z=6DgrJgh9Wma3Bu9NSvfJm4)RaoWu(?o*sj)(Kdp^(XIUDLWMP(AkIOaT;?hrfD5{ zTkf;G=4i0P2Z+43MRiW7bh?M57{H~#p$h~oEO%HWnGKOS}UdHIV!3%2C{eT9@6^Y zN)L=WbmY1AOUFveDu2NcrHTzd-0BYuhnnDl#a$1Mmg7(=8q2TPzv--1SM@n`Pnmww0!ulpNc5KyFyBmiBk57(47K*JGWXBb4VTxdiAf4${#CvG zH&x*OUunZvO-qZv1$-hrLnpf{S-8QBISm3iq*l-HGLYY8Jf7_Le?4EfiNf*DhFeX+ Rbxr_(a#G6oiY1JE{|6ePQ%e8< From c802f5430d4a404744ed64adfddda18dd57e525a Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Fri, 28 Mar 2025 23:39:18 -0400 Subject: [PATCH 090/593] [ROCm][AMD][Build] Update AMD supported arch list (#15632) Signed-off-by: Gregory Shtrasberg --- CMakeLists.txt | 2 +- docs/source/getting_started/installation/gpu/rocm.inc.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e0f1fdf78d142..9d15b77bc3798 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,7 +34,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0") # Supported AMD GPU architectures. -set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101") +set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201") # # Supported/expected torch versions for CUDA/ROCm. 
diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md index 4381cef5e96a3..cdd487696c8aa 100644 --- a/docs/source/getting_started/installation/gpu/rocm.inc.md +++ b/docs/source/getting_started/installation/gpu/rocm.inc.md @@ -8,7 +8,7 @@ There are no pre-built wheels for this device, so you must either use the pre-bu ## Requirements -- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) +- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100/1101), Radeon RX 9000 series (gfx1200/1201) - ROCm 6.3 ## Set up using Python From de1cb38769e9eb9812fa425c4fdfcf8faa3c420e Mon Sep 17 00:00:00 2001 From: pengyuange Date: Sat, 29 Mar 2025 11:39:21 +0800 Subject: [PATCH 091/593] [Model] Support Skywork-R1V (#15397) Signed-off-by: jiacai.liu <932997367@qq.com> Co-authored-by: jiacai.liu <932997367@qq.com> --- docs/source/models/supported_models.md | 7 + examples/offline_inference/vision_language.py | 36 + .../vision_language/test_models.py | 14 + .../vision_language/vlm_utils/model_utils.py | 57 + .../multimodal/processing/test_common.py | 9 +- tests/models/registry.py | 1 + vllm/entrypoints/chat_utils.py | 2 +- vllm/model_executor/models/registry.py | 1 + vllm/model_executor/models/skyworkr1v.py | 1014 +++++++++++++++++ vllm/transformers_utils/config.py | 5 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/skyworkr1v.py | 53 + 12 files changed, 1194 insertions(+), 7 deletions(-) create mode 100644 vllm/model_executor/models/skyworkr1v.py create mode 100644 vllm/transformers_utils/configs/skyworkr1v.py diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 793831fd06ded..8477158a00403 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -921,6 +921,13 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ * ✅︎ * ✅︎ 
+- * `SkyworkR1VChatModel` + * Skywork-R1V-38B + * T + I + * `Skywork/Skywork-R1V-38B` + * + * ✅︎ + * ✅︎ - * `UltravoxModel` * Ultravox * T + AE+ diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 0adbe574370d3..572eabe261930 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -804,6 +804,41 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: ) +# SkyworkR1V +def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "Skywork/Skywork-R1V-38B" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + messages = [[{ + 'role': 'user', + 'content': f"\n{question}" + }] for question in questions] + prompts = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + # Stop tokens for SkyworkR1V + # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py + stop_tokens = ["<|end▁of▁sentence|>", "<|endoftext|>"] + stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) + + model_example_map = { "aria": run_aria, "blip-2": run_blip2, @@ -834,6 +869,7 @@ model_example_map = { "qwen_vl": run_qwen_vl, "qwen2_vl": run_qwen2_vl, "qwen2_5_vl": run_qwen2_5_vl, + "skywork_chat": run_skyworkr1v, } diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index d500ef5d8b805..0d1d237e5693c 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -474,6 
+474,20 @@ VLM_TEST_SETTINGS = { vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output, prompt_path_encoder=model_utils.qwen_prompt_path_encoder, ), + "skywork_r1v": VLMTestInfo( + models=["Skywork/Skywork-R1V-38B"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|>\n", # noqa: E501 + single_image_prompts=IMAGE_ASSETS.prompts({ + "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501 + "cherry_blossom": "\nWhat is the season?", + }), + multi_image_prompt="\n\nDescribe the two images in short.", # noqa: E501 + max_model_len=4096, + use_tokenizer_eos=True, + patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner, + marks=[large_gpu_mark(min_gb=80)], + ), ### Tensor parallel / multi-gpu broadcast tests "chameleon-broadcast": VLMTestInfo( models=["facebook/chameleon-7b"], diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index c84bf6dc15f42..2ddf28aca4f63 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -376,6 +376,63 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: return hf_model +def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + """Patches and returns an instance of the HfRunner to use for SkyworkR1V.""" + + class SkyworkR1VProcessor: + """A simple processor for SkyworkR1V.""" + + def __init__(self, hf_runner: HfRunner): + self.num_image_token = hf_runner.model.num_image_token + self.tokenizer = hf_runner.tokenizer + + self.config = AutoConfig.from_pretrained(hf_runner.model_name, + trust_remote_code=True) + self.vision_config = self.config.vision_config + self.use_thumbnail = self.config.use_thumbnail + self.min_num = self.config.min_dynamic_patch + self.max_num = self.config.max_dynamic_patch + 
self.image_size = self.vision_config.image_size + + def __call__(self, text: str, images: Union[Image, list[Image]], + **kwargs): + from vllm.model_executor.models.skyworkr1v import ( + IMG_CONTEXT, IMG_END, IMG_START, + image_to_pixel_values_skyworkr1v) + images = [images] if isinstance(images, Image) else images + pixel_values = [ + image_to_pixel_values_skyworkr1v( + image, + input_size=self.image_size, + min_num=self.min_num, + max_num=self.max_num, + use_thumbnail=self.use_thumbnail, + ) for image in images + ] + num_patches_list = [ + pixel_value.shape[0] for pixel_value in pixel_values + ] + pixel_values = torch.cat(pixel_values, dim=0) + for num_patches in num_patches_list: + context_tokens = IMG_CONTEXT * self.num_image_token \ + * num_patches + image_tokens = IMG_START + context_tokens + IMG_END + text = text.replace('', image_tokens, 1) + prompt = self.tokenizer(text, return_tensors="pt") + prompt.update({"pixel_values": pixel_values}) + return prompt + + img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids( + "") + hf_model.model.img_context_token_id = img_context_token_id + hf_model.processor = SkyworkR1VProcessor(hf_model) + hf_model.model.get_output_embeddings = lambda: \ + hf_model.model.language_model.get_output_embeddings() + hf_model.model.generate = types.MethodType(_internvl_generate, + hf_model.model) + return hf_model + + def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: """Patches and returns an instance of the HfRunner to use for InternVL.""" diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 078ed21537b8d..e4f1d297fc092 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -262,22 +262,23 @@ def _test_processing_correctness_mistral( "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "meta-llama/Llama-3.2-11B-Vision-Instruct", "TIGER-Lab/Mantis-8B-siglip-llama3", - 
"mistralai/Pixtral-12B-2409", - "mistral-community/pixtral-12b", "openbmb/MiniCPM-Llama3-V-2_5", "openbmb/MiniCPM-o-2_6", "openbmb/MiniCPM-V-2_6", "allenai/Molmo-7B-D-0924", "allenai/Molmo-7B-O-0924", "nvidia/NVLM-D-72B", + "google/paligemma-3b-mix-224", + "google/paligemma2-3b-ft-docci-448", + "mistralai/Pixtral-12B-2409", + "mistral-community/pixtral-12b", "Qwen/Qwen-VL-Chat", "Qwen/Qwen2-VL-2B-Instruct", "Qwen/Qwen2.5-VL-3B-Instruct", "Qwen/Qwen2-Audio-7B-Instruct", + "Skywork/Skywork-R1V-38B", "fixie-ai/ultravox-v0_5-llama-3_2-1b", "openai/whisper-large-v3", - "google/paligemma-3b-mix-224", - "google/paligemma2-3b-ft-docci-448", ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) diff --git a/tests/models/registry.py b/tests/models/registry.py index d7946b75b7978..ff0c37a6afd76 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -294,6 +294,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501 min_transformers_version="4.49"), # noqa: E501 + "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"), "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 trust_remote_code=True), # [Encoder-decoder] diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 73a69d3037f7f..24382142768b5 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -496,7 +496,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): return self._cached_token_str(self._tokenizer, hf_config.image_token_index) if model_type in ("chameleon", "deepseek_vl_v2", "internvl_chat", - "NVLM_D", "h2ovl_chat"): + "skywork_chat", "NVLM_D", "h2ovl_chat"): return "" if model_type == "mllama": return "<|image|>" diff --git a/vllm/model_executor/models/registry.py 
b/vllm/model_executor/models/registry.py index 7797d9a2cc203..9288a4b81748e 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -190,6 +190,7 @@ _MULTIMODAL_MODELS = { # [Encoder-decoder] "Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"), # noqa: E501 "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501 + "SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"), "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501 } diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py new file mode 100644 index 0000000000000..ac5de0e36b894 --- /dev/null +++ b/vllm/model_executor/models/skyworkr1v.py @@ -0,0 +1,1014 @@ +# SPDX-License-Identifier: Apache-2.0 + +# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py +# -------------------------------------------------------- +# SkyworkR1V +# Copyright (c) 2025 Skywork +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +from abc import ABC, abstractmethod +from collections.abc import Iterable, Mapping, Sequence +from functools import cached_property +from typing import Literal, Optional, Set, Tuple, TypedDict, TypeVar, Union + +import torch +import torch.nn as nn +import torchvision.transforms as T +from PIL import Image +from transformers import BatchEncoding, PretrainedConfig, TensorType + +from vllm.config import VllmConfig +from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.awq import AWQConfig +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler +from vllm.model_executor.models.intern_vit import (InternVisionModel, + InternVisionPatchModel) +from 
vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + ImageSize, MultiModalDataItems) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement, + PromptUpdate, PromptUpdateDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.tokenizer import AnyTokenizer + +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, + maybe_prefix, merge_multimodal_embeddings) +from .vision import scatter_patch_features, select_patch_features + +IMG_START = '' +IMG_END = '' +IMG_CONTEXT = '' + +IMAGENET_MEAN = (0.485, 0.456, 0.406) +IMAGENET_STD = (0.229, 0.224, 0.225) + + +class SkyworkR1VImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + pixel_values_flat: torch.Tensor + """ + Shape: + `(batch_size * num_images * (1 + num_patches), num_channels, height, width)` + """ + + num_patches: torch.Tensor + """Shape: `(batch_size * num_images)`""" + + embed_is_patch: Union[torch.Tensor, list[torch.Tensor]] + """ + A boolean mask indicating which image embeddings correspond + to patch tokens. + + Shape: `(batch_size * num_images, num_embeds)` + """ + + +class SkyworkR1VImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: Union[torch.Tensor, list[torch.Tensor]] + """ + A tensor of shape `(num_images, total_image_feature_size, hidden_size)` + or a list of tensors of shape `(total_image_feature_size, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. 
+ """ + + +SkyworkR1VImageInputs = Union[SkyworkR1VImagePixelInputs, + SkyworkR1VImageEmbeddingInputs] + + +# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/ +def build_transform(input_size: int): + MEAN, STD = IMAGENET_MEAN, IMAGENET_STD + return T.Compose([ + T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), + T.Resize((input_size, input_size), + interpolation=T.InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD) + ]) + + +# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/ +def find_closest_aspect_ratio( + aspect_ratio: float, + target_ratios: list[tuple[int, int]], + *, + width: int, + height: int, + image_size: int, +) -> tuple[int, int]: + best_ratio_diff = float('inf') + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + +def resolve_skyworkr1v_min_max_num( + *, + min_dynamic_patch: int, + max_dynamic_patch: int, + dynamic_image_size: bool, + use_thumbnail: bool, +) -> tuple[int, int]: + min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1 + max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 + + if use_thumbnail and max_dynamic_patch != 1: + max_dynamic_patch += 1 + + return min_dynamic_patch, max_dynamic_patch + + +def get_skyworkr1v_target_ratios( + min_num: int, + max_num: int, +) -> list[tuple[int, int]]: + target_ratios = {(i, j) + for n in range(min_num, max_num + 1) + for i in range(1, n + 1) + for j in range(1, n + 1) if min_num <= i * j <= max_num} + return sorted(target_ratios, key=lambda x: x[0] * x[1]) + + +def calculate_skyworkr1v_targets( + *, + orig_width: int, + orig_height: int, + 
target_ratios: list[tuple[int, int]], + image_size: int, + use_thumbnail: bool, +) -> tuple[int, int, int]: + aspect_ratio = orig_width / orig_height + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio( + aspect_ratio, + target_ratios, + width=orig_width, + height=orig_height, + image_size=image_size, + ) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + + # add thumbnail image if num_blocks != 1 + if use_thumbnail and blocks != 1: + blocks += 1 + + return blocks, target_width, target_height + + +def dynamic_preprocess_skyworkr1v( + image: Image.Image, + *, + target_ratios: list[tuple[int, int]], + image_size: int, + use_thumbnail: bool, +) -> list[Image.Image]: + orig_width, orig_height = image.size + + # calculate the number of blocks without thumbnail + blocks, target_width, target_height = calculate_skyworkr1v_targets( + orig_width=orig_width, + orig_height=orig_height, + target_ratios=target_ratios, + image_size=image_size, + use_thumbnail=False, + ) + + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ((i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + + assert len(processed_images) == blocks + + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + + return processed_images + + +# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B +def image_to_pixel_values_skyworkr1v( + image: Image.Image, + *, + input_size: 
int, + min_num: int, + max_num: int, + use_thumbnail: bool, +) -> torch.Tensor: + target_ratios = get_skyworkr1v_target_ratios(min_num, max_num) + + transform = build_transform(input_size=input_size) + images = dynamic_preprocess_skyworkr1v( + image, + target_ratios=target_ratios, + image_size=input_size, + use_thumbnail=use_thumbnail, + ) + + pixel_values = torch.stack([transform(image) for image in images]) + return pixel_values + + +class BaseSkyworkR1VProcessor(ABC): + """ + This model doesn't define its own HF processor, + so we implement our own one here. + + The code to insert image tokens is based on: + https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252 + """ + + def __init__( + self, + config: PretrainedConfig, + tokenizer: AnyTokenizer, + *, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + ) -> None: + super().__init__() + + self.config = config + self.tokenizer = tokenizer + + image_size: int = config.vision_config.image_size + patch_size: int = config.vision_config.patch_size + + if min_dynamic_patch is None: + min_dynamic_patch = config.min_dynamic_patch + assert isinstance(min_dynamic_patch, int) + + if max_dynamic_patch is None: + max_dynamic_patch = config.max_dynamic_patch + assert isinstance(max_dynamic_patch, int) + + if dynamic_image_size is None: + dynamic_image_size = config.dynamic_image_size + assert isinstance(dynamic_image_size, bool) + + self.num_image_token = int( + (image_size // patch_size)**2 * (config.downsample_ratio**2)) + self.image_size = image_size + self.min_dynamic_patch = min_dynamic_patch + self.max_dynamic_patch = max_dynamic_patch + self.dynamic_image_size = dynamic_image_size + self.use_thumbnail: bool = config.use_thumbnail + + @property + @abstractmethod + def image_token_id(self) -> int: + raise NotImplementedError + + @abstractmethod + def get_image_repl( + self, + feature_size: int, + num_patches: 
Optional[int], + ) -> PromptUpdateDetails[str]: + raise NotImplementedError + + def resolve_min_max_num( + self, + *, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + use_thumbnail: Optional[bool] = None, + ) -> tuple[int, int]: + min_dynamic_patch = (self.min_dynamic_patch if min_dynamic_patch + is None else min_dynamic_patch) + max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch + is None else max_dynamic_patch) + dynamic_image_size = (self.dynamic_image_size if dynamic_image_size + is None else dynamic_image_size) + use_thumbnail = (self.use_thumbnail + if use_thumbnail is None else use_thumbnail) + + return resolve_skyworkr1v_min_max_num( + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=use_thumbnail, + ) + + def resolve_target_ratios( + self, + *, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + use_thumbnail: Optional[bool] = None, + ) -> list[tuple[int, int]]: + min_num, max_num = self.resolve_min_max_num( + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=use_thumbnail, + ) + + return get_skyworkr1v_target_ratios(min_num, max_num) + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + target_ratios = self.resolve_target_ratios( + use_thumbnail=False, # Applied in calculate_targets + ) + + num_patches, _, _ = calculate_skyworkr1v_targets( + orig_width=image_width, + orig_height=image_height, + image_size=self.image_size, + target_ratios=target_ratios, + use_thumbnail=self.use_thumbnail, + ) + + return num_patches * self.num_image_token + + def _images_to_pixel_values_lst( + self, + images: list[Image.Image], + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: 
Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + ) -> list[torch.Tensor]: + min_num, max_num = self.resolve_min_max_num( + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=False, # Applied in image_to_pixel_values + ) + + return [ + image_to_pixel_values_skyworkr1v( + image, + input_size=self.image_size, + min_num=min_num, + max_num=max_num, + use_thumbnail=self.use_thumbnail, + ) for image in images + ] + + def __call__( + self, + text: Optional[Union[str, list[str]]] = None, + images: Optional[Union[Image.Image, list[Image.Image]]] = None, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> Mapping[str, NestedTensors]: + if text is None: + text = [] + if not isinstance(text, list): + text = [text] + if images is None: + images = [] + if not isinstance(images, list): + images = [images] + + if len(images) == 0: + image_inputs = {} + else: + pixel_values_lst = self._images_to_pixel_values_lst( + images, + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + image_inputs: dict[str, NestedTensors] = { + "pixel_values_flat": + torch.cat(pixel_values_lst), + "image_num_patches": + torch.tensor([len(item) for item in pixel_values_lst]), + } + + tokenizer = self.tokenizer + image_token_id = self.image_token_id + + embed_is_patch = list[torch.Tensor]() + + for pixel_values in pixel_values_lst: + num_patches = pixel_values.shape[0] + feature_size = num_patches * self.num_image_token + + image_repl = self.get_image_repl(feature_size, num_patches) + feature_tokens = tokenizer.encode(image_repl.features, + add_special_tokens=False) + + text = [t.replace('', image_repl.full, 1) for t in text] + embed_is_patch.append( + torch.tensor(feature_tokens) == 
image_token_id) + + image_inputs["embed_is_patch"] = embed_is_patch + + text_inputs = self.tokenizer(text) + + return { + **BatchEncoding(text_inputs, tensor_type=return_tensors), + **image_inputs, + } + + +class SkyworkR1VProcessor(BaseSkyworkR1VProcessor): + + @property + def image_token_id(self) -> int: + return self.tokenizer.get_vocab()[IMG_CONTEXT] + + def get_image_repl( + self, + feature_size: int, + num_patches: Optional[int], + ) -> PromptUpdateDetails[str]: + repl_features = IMG_CONTEXT * feature_size + repl_full = IMG_START + repl_features + IMG_END + + return PromptUpdateDetails(full=repl_full, features=repl_features) + + +class BaseSkyworkR1VProcessingInfo(BaseProcessingInfo): + + @abstractmethod + def get_hf_processor( + self, + *, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + **kwargs: object, + ) -> BaseSkyworkR1VProcessor: + raise NotImplementedError + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + return {"image": self.get_max_image_tokens()} + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + processor: Optional[BaseSkyworkR1VProcessor], + ) -> int: + if processor is None: + processor = self.get_hf_processor() + + return processor.get_num_image_tokens( + image_width=image_width, + image_height=image_height, + ) + + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + processor=None, + ) + + def get_image_size_with_most_features(self) -> ImageSize: + processor = self.get_hf_processor() + + base_size = processor.image_size + target_ratios = processor.resolve_target_ratios() + + largest_feature_size, 
largest_feature_pinpoint = 0, None + for wr, hr in target_ratios: + width, height = base_size * wr, base_size * hr + + feat_size = self.get_num_image_tokens( + image_width=width, + image_height=height, + processor=processor, + ) + if feat_size > largest_feature_size: + largest_feature_size = feat_size + largest_feature_pinpoint = ImageSize(width=width, + height=height) + + if largest_feature_size == 0 or largest_feature_pinpoint is None: + raise ValueError("Cannot have a largest feature size of 0!") + + return largest_feature_pinpoint + + +_I = TypeVar("_I", bound=BaseSkyworkR1VProcessingInfo) + + +class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[_I]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + target_width, target_height = \ + self.info.get_image_size_with_most_features() + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="" * num_images, + mm_data=mm_data, + ) + + +class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[_I]): + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> Mapping[str, NestedTensors]: + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + hf_processor = self.info.get_hf_processor(**mm_kwargs) + image_token_id = hf_processor.image_token_id + + # Since there may be extra tokens in the feature placeholders, + # we need to pass the image token ID to the model to select the + # tokens to merge from the vision encoder outputs + processed_outputs["image_token_id"] = torch.tensor(image_token_id) + + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: Mapping[str, NestedTensors], + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, 
MultiModalFieldConfig]: + image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0)) + num_images = len(image_num_patches) + + return dict( + pixel_values_flat=MultiModalFieldConfig.flat_from_sizes( + "image", image_num_patches), + image_num_patches=MultiModalFieldConfig.batched("image"), + embed_is_patch=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + image_token_id=MultiModalFieldConfig.shared("image", num_images), + ) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + if "image_num_patches" in out_mm_kwargs: + image_num_patches = out_mm_kwargs["image_num_patches"] + assert isinstance(image_num_patches, torch.Tensor) + image_num_patches = image_num_patches.tolist() + elif "image_embeds" in out_mm_kwargs: + # TODO: Use image size information in dictionary embedding inputs + # to compute num_patches (similar to Qwen2-VL) + image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) + else: + image_num_patches = [] + + def get_replacement_skyworkr1v(item_idx: int): + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems)) + + if isinstance(images, ImageEmbeddingItems): + feature_size = images.get_feature_size(item_idx) + else: + image_size = images.get_image_size(item_idx) + feature_size = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + processor=hf_processor, + ) + + num_patches = image_num_patches[item_idx] + if num_patches is not None: + assert isinstance(num_patches, int) + + return hf_processor.get_image_repl(feature_size, num_patches) + + return [ + PromptReplacement( + modality="image", + target="", + replacement=get_replacement_skyworkr1v, + ) + ] + + +class SkyworkR1VProcessingInfo(BaseSkyworkR1VProcessingInfo): 
+ + def get_hf_processor( + self, + *, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + **kwargs: object, + ) -> SkyworkR1VProcessor: + if min_dynamic_patch is not None: + kwargs["min_dynamic_patch"] = min_dynamic_patch + if max_dynamic_patch is not None: + kwargs["max_dynamic_patch"] = max_dynamic_patch + if dynamic_image_size is not None: + kwargs["dynamic_image_size"] = dynamic_image_size + + return self.ctx.init_processor( + SkyworkR1VProcessor, + config=self.get_hf_config(), + tokenizer=self.get_tokenizer(), + **kwargs, + ) + + +@MULTIMODAL_REGISTRY.register_processor( + SkyworkR1VMultiModalProcessor, + info=SkyworkR1VProcessingInfo, + dummy_inputs=SkyworkR1VDummyInputsBuilder) +class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + self._patch_quant_config(config, quant_config) + + image_size = config.force_image_size or config.vision_config.image_size + patch_size = config.vision_config.patch_size + self.patch_size = patch_size + self.num_image_token = int( + (image_size // patch_size)**2 * (config.downsample_ratio**2)) + self.downsample_ratio = config.downsample_ratio + self.ps_version = config.ps_version + + self.llm_arch_name = config.text_config.architectures[0] + self.is_mono = self.llm_arch_name == 'SkyworkLM2VEForCausalLM' + self.vision_model = self._init_vision_model( + config, + quant_config=quant_config, + is_mono=self.is_mono, + prefix=maybe_prefix(prefix, "vision_model"), + ) + + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + hf_config=config.text_config, + prefix=maybe_prefix(prefix, 
"language_model"), + ) + + self.mlp1 = self._init_mlp1(config) + + self.img_context_token_id = None + self.visual_token_mask = None + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors) + + def _patch_quant_config(self, config: PretrainedConfig, + quant_config: QuantizationConfig): + # the awq models from OpenGVLab missing `modules_to_not_convert` + # patch the quant_config to add `modules_to_not_convert` back + if isinstance(quant_config, AWQConfig): + text_config = config.text_config + llm_quant_config = getattr(text_config, "quantization_config", + None) + if (not quant_config.modules_to_not_convert) and \ + (llm_quant_config is not None): + quant_config.modules_to_not_convert.append("vision_model") + + @cached_property + def sampler(self): + if hasattr(self.language_model, "sampler"): + return self.language_model.sampler + + return get_sampler() + + def _init_vision_model( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + *, + is_mono: bool, + prefix: str, + ): + if not is_mono: + vision_feature_layer = config.select_layer + if vision_feature_layer < 0: + num_hidden_layers = config.vision_config.num_hidden_layers \ + + vision_feature_layer + 1 + else: + num_hidden_layers = vision_feature_layer + 1 + + return InternVisionModel( + config.vision_config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers, + prefix=prefix, + ) + else: + return InternVisionPatchModel(config.vision_config) + + def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential: + vit_hidden_size = config.vision_config.hidden_size + llm_hidden_size = config.text_config.hidden_size + + return nn.Sequential( + nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio)**2), + ReplicatedLinear(vit_hidden_size * + int(1 / self.downsample_ratio)**2, + llm_hidden_size, + return_bias=False), + nn.GELU(), + ReplicatedLinear(llm_hidden_size, + llm_hidden_size, + return_bias=False), + ) + + def 
pixel_shuffle(self, x, scale_factor=0.5): + n, w, h, c = x.size() + # N, W, H, C --> N, W, H * scale, C // scale + x = x.view(n, w, int(h * scale_factor), int(c / scale_factor)) + # N, W, H * scale, C // scale --> N, H * scale, W, C // scale + x = x.permute(0, 2, 1, 3).contiguous() + x = x.view(n, int(h * scale_factor), int(w * scale_factor), + int(c / (scale_factor * scale_factor))) + if self.ps_version == 'v1': + pass + else: + x = x.permute(0, 2, 1, 3).contiguous() + return x + + def extract_feature(self, pixel_values: torch.Tensor) -> torch.Tensor: + vit_embeds = self.vision_model(pixel_values=pixel_values) + vit_embeds = vit_embeds[:, 1:, :] + + h = w = int(vit_embeds.shape[1]**0.5) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) + vit_embeds = self.pixel_shuffle(vit_embeds, + scale_factor=self.downsample_ratio) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, + vit_embeds.shape[-1]) + vit_embeds = self.mlp1(vit_embeds) + return vit_embeds + + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + + h = w = self.config.vision_config.image_size + expected_dims = (3, h, w) + + def _validate_shape(d: torch.Tensor): + actual_dims = tuple(d.shape) + + if actual_dims != expected_dims: + expected_expr = str(expected_dims) + raise ValueError( + "The expected shape of pixel values per image per batch " + f" per patch is {expected_expr}. 
" + f"You supplied {tuple(d.shape)}.") + + for d in data: + _validate_shape(d) + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[SkyworkR1VImageInputs]: + pixel_values_flat = kwargs.pop("pixel_values_flat", None) + image_num_patches = kwargs.pop("image_num_patches", None) + embed_is_patch = kwargs.pop("embed_is_patch", None) + image_embeds = kwargs.pop("image_embeds", None) + + if pixel_values_flat is None and image_embeds is None: + return None + + if image_embeds is not None: + if not isinstance(image_embeds, (torch.Tensor, list)): + raise ValueError("Incorrect type of image embeddings. " + f"Got type: {type(image_embeds)}") + + return SkyworkR1VImageEmbeddingInputs( + type="image_embeds", + data=flatten_bn(image_embeds), + ) + + image_token_id = kwargs["image_token_id"] + assert isinstance(image_token_id, torch.Tensor) + self.img_context_token_id = image_token_id.flatten().unique().item() + + if pixel_values_flat is not None: + if not isinstance(pixel_values_flat, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values_flat)}") + + if not isinstance(image_num_patches, (torch.Tensor, list)): + raise ValueError("Incorrect type of image_num_patches. " + f"Got type: {type(image_num_patches)}") + + if not isinstance(embed_is_patch, (torch.Tensor, list)): + raise ValueError("Incorrect type of embed_is_patch. 
" + f"Got type: {type(embed_is_patch)}") + + pixel_values_flat = flatten_bn(pixel_values_flat, concat=True) + image_num_patches = flatten_bn(image_num_patches, concat=True) + embed_is_patch = flatten_bn(embed_is_patch) + + return SkyworkR1VImagePixelInputs( + type="pixel_values", + pixel_values_flat=self._validate_pixel_values( + pixel_values_flat), + num_patches=image_num_patches, + embed_is_patch=embed_is_patch, + ) + + raise AssertionError("This line should be unreachable.") + + def _process_image_input( + self, + image_input: SkyworkR1VImageInputs, + ) -> Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor, ...]]: + if image_input["type"] == "image_embeds": + return image_input["data"] + + assert self.vision_model is not None + + image_embeds = self.extract_feature(image_input["pixel_values_flat"]) + + num_patches = image_input["num_patches"] + + # Only one image in the current batch + if len(num_patches) == 1: + return image_embeds.view( + -1, self.config.text_config.hidden_size).unsqueeze(0) + + # NOTE: Image embeddings are split into separate tensors for each image + # by the size of each embedding. 
+ feature_size = image_embeds.shape[1] + image_embeds = image_embeds.view(-1, + self.config.text_config.hidden_size) + image_feature_sizes = [ + num_patches * feature_size for num_patches in num_patches + ] + return image_embeds.split(image_feature_sizes) + + def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None: + if self.is_mono: + self.visual_token_mask = ( + input_ids == self.img_context_token_id).reshape(-1, 1) + else: + self.visual_token_mask = None + + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + + image_features = self._process_image_input(image_input) + + if image_input["type"] != "pixel_values": + return image_features + + return scatter_patch_features( + image_features, + image_input["embed_is_patch"], + ) + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + assert self.img_context_token_id is not None + self._set_visual_token_mask(input_ids) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + select_patch_features(multimodal_embeddings), + self.img_context_token_id, + ) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[SamplerOutput, IntermediateTensors]: + + if intermediate_tensors is not None: + input_ids = None + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + forward_kwargs = { + "input_ids": input_ids, + "positions": positions, + "intermediate_tensors": intermediate_tensors, + "inputs_embeds": inputs_embeds, + } + + # Only required if the model is mono-architecture + if self.visual_token_mask is not None: + forward_kwargs.update( + {"visual_token_mask": self.visual_token_mask}) + self.visual_token_mask = None + + hidden_states = self.language_model.model(**forward_kwargs) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + return self.language_model.sample(logits, sampling_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + skip_prefixes = [ + "action_embed", "temporal_embed", "track_embed", + "track_embed_decoder", "box_token", "cg_criterion", "cg_model", + "loc_encoder", "loc_decoder", "sam", "temporal_token", + "track_token" + ] + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) + return loader.load_weights(weights) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 1937b13884711..71990468c315a 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -37,8 +37,8 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config, MLPSpeculatorConfig, MPTConfig, NemotronConfig, NVLM_D_Config, Olmo2Config, RWConfig, - SolarConfig, Telechat2Config, - UltravoxConfig) + SkyworkR1VChatConfig, SolarConfig, + Telechat2Config, UltravoxConfig) # yapf: enable from vllm.transformers_utils.utils import 
check_gguf_file from vllm.utils import resolve_obj_by_qualname @@ -76,6 +76,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { "NVLM_D": NVLM_D_Config, "olmo2": Olmo2Config, "solar": SolarConfig, + "skywork_chat": SkyworkR1VChatConfig, "telechat": Telechat2Config, "ultravox": UltravoxConfig, **_CONFIG_REGISTRY_OVERRIDE_HF diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 9060565596b21..53699341bfba8 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -20,6 +20,7 @@ from vllm.transformers_utils.configs.mpt import MPTConfig from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config from vllm.transformers_utils.configs.olmo2 import Olmo2Config +from vllm.transformers_utils.configs.skyworkr1v import SkyworkR1VChatConfig from vllm.transformers_utils.configs.solar import SolarConfig from vllm.transformers_utils.configs.telechat2 import Telechat2Config from vllm.transformers_utils.configs.ultravox import UltravoxConfig @@ -42,6 +43,7 @@ __all__ = [ "NemotronConfig", "NVLM_D_Config", "Olmo2Config", + "SkyworkR1VChatConfig", "SolarConfig", "Telechat2Config", "UltravoxConfig", diff --git a/vllm/transformers_utils/configs/skyworkr1v.py b/vllm/transformers_utils/configs/skyworkr1v.py new file mode 100644 index 0000000000000..ef5f9ba85c237 --- /dev/null +++ b/vllm/transformers_utils/configs/skyworkr1v.py @@ -0,0 +1,53 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Adapted from +# https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/configuration_skywork_chat.py +# -------------------------------------------------------- +# SkyworkR1V +# Copyright (c) 2025 Skywork +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +from transformers.configuration_utils import PretrainedConfig + + +class 
SkyworkR1VChatConfig(PretrainedConfig): + model_type = 'internvl_chat' + is_composition = True + + def __init__(self, + vision_config=None, + llm_config=None, + use_backbone_lora=0, + use_llm_lora=0, + select_layer=-1, + force_image_size=None, + downsample_ratio=0.5, + template=None, + dynamic_image_size=False, + use_thumbnail=False, + ps_version='v1', + min_dynamic_patch=1, + max_dynamic_patch=6, + **kwargs): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {} + + if llm_config is None: + llm_config = {} + + self.vision_config = PretrainedConfig(**vision_config) + self.text_config = PretrainedConfig(**llm_config) + + self.use_backbone_lora = use_backbone_lora + self.use_llm_lora = use_llm_lora + self.select_layer = select_layer + self.force_image_size = force_image_size + self.downsample_ratio = downsample_ratio + self.template = template + self.dynamic_image_size = dynamic_image_size + self.use_thumbnail = use_thumbnail + self.ps_version = ps_version # pixel shuffle version + self.min_dynamic_patch = min_dynamic_patch + self.max_dynamic_patch = max_dynamic_patch From 762b424a528e025ef3d9b02828eb926c6dbddb2c Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Sat, 29 Mar 2025 11:46:57 +0800 Subject: [PATCH 092/593] [Docs] Document v0 engine support in reasoning outputs (#15739) Signed-off-by: Ce Gao --- docs/source/features/reasoning_outputs.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/source/features/reasoning_outputs.md b/docs/source/features/reasoning_outputs.md index 879b16d4f7b50..3a0be69f8e1c6 100644 --- a/docs/source/features/reasoning_outputs.md +++ b/docs/source/features/reasoning_outputs.md @@ -136,7 +136,14 @@ Remember to check whether the `reasoning_content` exists in the response before ## Structured output -The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output. 
+The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output. It is only supported in v0 engine now. + +```bash +VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ + --enable-reasoning --reasoning-parser deepseek_r1 +``` + +Please note that the `VLLM_USE_V1` environment variable must be set to `0` to use the v0 engine. ```python from openai import OpenAI From 6d531ad7b810522fde902cb1cbf95f52bfc77860 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 28 Mar 2025 20:59:47 -0700 Subject: [PATCH 093/593] [Misc][V1] Misc code streamlining (#15723) Signed-off-by: Nick Hill --- vllm/distributed/utils.py | 5 +-- vllm/v1/core/sched/scheduler.py | 53 ++++++++++++++---------------- vllm/v1/engine/core_client.py | 2 +- vllm/v1/engine/output_processor.py | 2 +- vllm/v1/request.py | 8 +++-- 5 files changed, 32 insertions(+), 38 deletions(-) diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 4206a24465e28..cae1a25519b3e 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -207,10 +207,7 @@ class StatelessProcessGroup: def barrier(self): """A barrier to synchronize all ranks.""" for i in range(self.world_size): - if i == self.rank: - self.broadcast_obj(None, src=self.rank) - else: - self.broadcast_obj(None, src=i) + self.broadcast_obj(None, src=i) @staticmethod def create( diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 448119761259c..094602a8b732d 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -269,29 +269,26 @@ class Scheduler(SchedulerInterface): request = self.waiting[0] - # Waiting request skipping logic - is_skipped = False # Skip request if the structured output request is still waiting - # for FSM. - if (not is_skipped - and request.status == RequestStatus.WAITING_FOR_FSM): + # for FSM compilation. 
+ if request.status == RequestStatus.WAITING_FOR_FSM: structured_output_req = request.structured_output_request - is_skipped = (not structured_output_req - or not structured_output_req.grammar) - if not is_skipped: + if structured_output_req and structured_output_req.grammar: request.status = RequestStatus.WAITING + else: + self.waiting.popleft() + skipped_waiting_requests.appendleft(request) + continue - # Skip request if max_loras can't be honored. - if (not is_skipped and self.lora_config - and request.lora_request): - req_lora_id = request.lora_request.lora_int_id - is_skipped = (len(scheduled_loras) - == self.lora_config.max_loras - and (req_lora_id not in scheduled_loras)) - - if is_skipped: - skipped_waiting_requests.appendleft(request) + # Check that adding the request still respects the max_loras + # constraint. + if self.lora_config and request.lora_request and ( + len(scheduled_loras) == self.lora_config.max_loras + and request.lora_request.lora_int_id + not in scheduled_loras): + # Scheduling would exceed max_loras, skip. self.waiting.popleft() + skipped_waiting_requests.appendleft(request) continue # Get already-cached tokens. @@ -602,8 +599,9 @@ class Scheduler(SchedulerInterface): # OPTIMIZATION: Avoid list(set) if the set is empty. if cached_encoder_input_ids: for input_id in list(cached_encoder_input_ids): - start_pos = request.mm_positions[input_id]["offset"] - num_tokens = request.mm_positions[input_id]["length"] + mm_positions = request.mm_positions[input_id] + start_pos = mm_positions["offset"] + num_tokens = mm_positions["length"] if start_pos + num_tokens <= request.num_computed_tokens: # The encoder output is already processed and stored # in the decoder's KV cache. @@ -616,25 +614,24 @@ class Scheduler(SchedulerInterface): stopped = False new_logprobs = None - new_token_ids: list[int] = [] + new_token_ids = generated_token_ids # Append generated tokens and check for stop. 
Note that if # a request is still being prefilled, we expect the model runner # to return empty token ids for the request. - for output_token_id in generated_token_ids: + for num_new, output_token_id in enumerate(new_token_ids, 1): request.append_output_token_ids(output_token_id) - new_token_ids.append(output_token_id) # Check for stop and update request state. # This must be called before we make the EngineCoreOutput. stopped = check_stop(request, self.max_model_len) if stopped: self._free_request(request) + del new_token_ids[num_new:] # Trim new tokens if needed. break # Extract sample logprobs if needed. - if (request.sampling_params.logprobs is not None - and logprobs is not None): + if request.sampling_params.logprobs is not None and logprobs: # NOTE: once we support N tokens per step (spec decode), # the outer lists can be of length > 1. new_logprobs = logprobs.slice(req_index, req_index + 1) @@ -644,9 +641,7 @@ class Scheduler(SchedulerInterface): # should not be None if use_structured_output, we have # check above, so safe to ignore type warning request.structured_output_request.grammar.accept_tokens( # type: ignore[union-attr] - request.request_id, - new_token_ids, - ) + req_id, new_token_ids) # Get prompt logprobs for this request. prompt_logprobs_tensors = prompt_logprobs_dict.get(req_id) @@ -665,7 +660,7 @@ class Scheduler(SchedulerInterface): # Invariant: EngineCore returns no partial prefill outputs. 
assert not prompt_logprobs_tensors - self.scheduled_req_ids.remove(request.request_id) + self.scheduled_req_ids.remove(req_id) if not stopped: new_running.append(request) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index c41ee6704be0f..8858a564d2c2b 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -416,9 +416,9 @@ class SyncMPClient(MPClient): def process_outputs_socket(): shutdown_socket = ctx.socket(zmq.PAIR) - shutdown_socket.bind(shutdown_path) out_socket = make_zmq_socket(ctx, output_path, zmq.constants.PULL) try: + shutdown_socket.bind(shutdown_path) poller = zmq.Poller() poller.register(shutdown_socket) poller.register(out_socket) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 1e67bed261182..70f072d3c9399 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -328,7 +328,7 @@ class OutputProcessor: # 2) Detokenize the token ids into text and perform stop checks. 
stop_string = req_state.detokenizer.update( new_token_ids, finish_reason == FinishReason.STOP) - if stop_string and finish_reason != FinishReason.STOP: + if stop_string: finish_reason = FinishReason.STOP stop_reason = stop_string diff --git a/vllm/v1/request.py b/vllm/v1/request.py index efb5a54d12077..48e5132678c13 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -93,9 +93,11 @@ class Request: token_ids: Union[int, list[int]], ) -> None: if isinstance(token_ids, int): - token_ids = [token_ids] - self._output_token_ids.extend(token_ids) - self._all_token_ids.extend(token_ids) + self._output_token_ids.append(token_ids) + self._all_token_ids.append(token_ids) + else: + self._output_token_ids.extend(token_ids) + self._all_token_ids.extend(token_ids) @property def num_tokens(self) -> int: From 1286211f573586719d80e96ce1e618b620e61f56 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Fri, 28 Mar 2025 21:10:41 -0700 Subject: [PATCH 094/593] [Bugfix] LoRA V1: add and fix entrypoints tests (#15715) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- .../llm/test_generate_multiple_loras.py | 14 +++++++++++++- tests/entrypoints/openai/test_lora_adapters.py | 15 ++++++++++++++- vllm/entrypoints/openai/serving_models.py | 2 +- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py index 90e1d58141378..099af0f36088b 100644 --- a/tests/entrypoints/llm/test_generate_multiple_loras.py +++ b/tests/entrypoints/llm/test_generate_multiple_loras.py @@ -23,7 +23,19 @@ LORA_NAME = "typeof/zephyr-7b-beta-lora" @pytest.fixture(scope="module") -def llm(): +def monkeypatch_module(): + from _pytest.monkeypatch import MonkeyPatch + mpatch = MonkeyPatch() + yield mpatch + mpatch.undo() + + +@pytest.fixture(scope="module", params=[False, True]) +def llm(request, monkeypatch_module): + + use_v1 = request.param + 
monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') + # pytest caches the fixture so we use weakref.proxy to # enable garbage collection llm = LLM(model=MODEL_NAME, diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index 1a62157acc478..2fc08b47513e6 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -53,7 +53,20 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") -def server_with_lora_modules_json(zephyr_lora_files): +def monkeypatch_module(): + from _pytest.monkeypatch import MonkeyPatch + mpatch = MonkeyPatch() + yield mpatch + mpatch.undo() + + +@pytest.fixture(scope="module", params=[False, True]) +def server_with_lora_modules_json(request, monkeypatch_module, + zephyr_lora_files): + + use_v1 = request.param + monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') + # Define the json format LoRA module configurations lora_module_1 = { "name": "zephyr-lora", diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 38a66583022a2..7a68452efc653 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -162,7 +162,7 @@ class OpenAIServingModels: except BaseException as e: error_type = "BadRequestError" status_code = HTTPStatus.BAD_REQUEST - if isinstance(e, ValueError) and "No adapter found" in str(e): + if "No adapter found" in str(e): error_type = "NotFoundError" status_code = HTTPStatus.NOT_FOUND From 7a7992085b75fde8f6b9717f6be7859b390b9093 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Sat, 29 Mar 2025 00:10:45 -0400 Subject: [PATCH 095/593] [CI] Speed up V1 structured output tests (#15718) Signed-off-by: Russell Bryant --- .../llm/test_struct_output_generate.py | 222 +++++++----------- 1 file changed, 89 insertions(+), 133 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py 
b/tests/v1/entrypoints/llm/test_struct_output_generate.py index c9fa03a1ae1fb..a32dd8263992e 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -23,20 +23,46 @@ MODELS_TO_TEST = [ ] +class CarType(str, Enum): + sedan = "sedan" + suv = "SUV" + truck = "Truck" + coupe = "Coupe" + + +class CarDescription(BaseModel): + brand: str + model: str + car_type: CarType + + @pytest.mark.skip_global_cleanup @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS_V1) @pytest.mark.parametrize("model_name", MODELS_TO_TEST) -def test_guided_json_completion( +def test_structured_output( monkeypatch: pytest.MonkeyPatch, sample_json_schema: dict[str, Any], + unsupported_json_schema: dict[str, Any], + sample_sql_ebnf: str, + sample_sql_lark: str, + sample_regex: str, + sample_guided_choice: str, guided_decoding_backend: str, model_name: str, ): monkeypatch.setenv("VLLM_USE_V1", "1") + + # Use a single LLM instance for several scenarios to + # speed up the test suite. 
llm = LLM(model=model_name, + enforce_eager=True, max_model_len=1024, guided_decoding_backend=guided_decoding_backend) + + # + # Test 1: Generate JSON output based on a provided schema + # sampling_params = SamplingParams( temperature=1.0, max_tokens=1000, @@ -63,20 +89,9 @@ def test_guided_json_completion( output_json = json.loads(generated_text) jsonschema.validate(instance=output_json, schema=sample_json_schema) - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", - GUIDED_DECODING_BACKENDS_V1) -@pytest.mark.parametrize("model_name", MODELS_TO_TEST) -def test_guided_json_object( - monkeypatch: pytest.MonkeyPatch, - guided_decoding_backend: str, - model_name: str, -): - monkeypatch.setenv("VLLM_USE_V1", "1") - llm = LLM(model=model_name, - max_model_len=1024, - guided_decoding_backend=guided_decoding_backend) + # + # Test 2: Generate JSON object without a schema + # sampling_params = SamplingParams( temperature=1.0, max_tokens=100, @@ -111,21 +126,9 @@ def test_guided_json_object( allowed_types = (dict, list) assert isinstance(parsed_json, allowed_types) - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", - GUIDED_DECODING_BACKENDS_V1 + ["auto"]) -@pytest.mark.parametrize("model_name", MODELS_TO_TEST) -def test_guided_json_unsupported_schema( - monkeypatch: pytest.MonkeyPatch, - unsupported_json_schema: dict[str, Any], - guided_decoding_backend: str, - model_name: str, -): - monkeypatch.setenv("VLLM_USE_V1", "1") - llm = LLM(model=model_name, - max_model_len=1024, - guided_decoding_backend=guided_decoding_backend) + # + # Test 3: test a jsonschema incompatible with xgrammar + # sampling_params = SamplingParams( temperature=1.0, max_tokens=1000, @@ -141,8 +144,6 @@ def test_guided_json_unsupported_schema( sampling_params=sampling_params, use_tqdm=True) else: - # This should work for both "guidance" and "auto". 
- outputs = llm.generate( prompts=("Give an example JSON object for a grade " "that fits this schema: " @@ -161,21 +162,9 @@ def test_guided_json_unsupported_schema( parsed_json = json.loads(generated_text) assert isinstance(parsed_json, dict) - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", - GUIDED_DECODING_BACKENDS_V1) -@pytest.mark.parametrize("model_name", MODELS_TO_TEST) -def test_guided_grammar_ebnf( - monkeypatch: pytest.MonkeyPatch, - sample_sql_ebnf: str, - guided_decoding_backend: str, - model_name: str, -): - monkeypatch.setenv("VLLM_USE_V1", "1") - llm = LLM(model=model_name, - max_model_len=1024, - guided_decoding_backend=guided_decoding_backend) + # + # Test 4: Generate SQL statement using EBNF grammar + # sampling_params = SamplingParams( temperature=0.8, top_p=0.95, @@ -205,21 +194,9 @@ def test_guided_grammar_ebnf( print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", - GUIDED_DECODING_BACKENDS_V1) -@pytest.mark.parametrize("model_name", MODELS_TO_TEST) -def test_guided_grammar_lark( - monkeypatch: pytest.MonkeyPatch, - sample_sql_lark: str, - guided_decoding_backend: str, - model_name: str, -): - monkeypatch.setenv("VLLM_USE_V1", "1") - llm = LLM(model=model_name, - max_model_len=1024, - guided_decoding_backend=guided_decoding_backend) + # + # Test 5: Generate SQL statement using Lark grammar + # sampling_params = SamplingParams( temperature=0.8, top_p=0.95, @@ -254,20 +231,9 @@ def test_guided_grammar_lark( print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", - GUIDED_DECODING_BACKENDS_V1) -@pytest.mark.parametrize("model_name", MODELS_TO_TEST) -def test_guided_grammar_ebnf_invalid( - monkeypatch: pytest.MonkeyPatch, - guided_decoding_backend: str, - model_name: str, -): - monkeypatch.setenv("VLLM_USE_V1", 
"1") - llm = LLM(model=model_name, - max_model_len=1024, - guided_decoding_backend=guided_decoding_backend) + # + # Test 6: Test invalid grammar input + # sampling_params = SamplingParams( temperature=0.8, top_p=0.95, @@ -281,21 +247,9 @@ def test_guided_grammar_ebnf_invalid( use_tqdm=True, ) - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", - GUIDED_DECODING_BACKENDS_V1) -@pytest.mark.parametrize("model_name", MODELS_TO_TEST) -def test_guided_regex( - monkeypatch: pytest.MonkeyPatch, - sample_regex: str, - guided_decoding_backend: str, - model_name: str, -): - monkeypatch.setenv("VLLM_USE_V1", "1") - llm = LLM(model=model_name, - max_model_len=1024, - guided_decoding_backend=guided_decoding_backend) + # + # Test 7: Generate text based on a regex pattern + # sampling_params = SamplingParams( temperature=0.8, top_p=0.95, @@ -319,21 +273,9 @@ def test_guided_regex( assert re.fullmatch(sample_regex, generated_text) is not None print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", - GUIDED_DECODING_BACKENDS_V1) -@pytest.mark.parametrize("model_name", MODELS_TO_TEST) -def test_guided_choice_completion( - monkeypatch: pytest.MonkeyPatch, - sample_guided_choice: str, - guided_decoding_backend: str, - model_name: str, -): - monkeypatch.setenv("VLLM_USE_V1", "1") - llm = LLM(model=model_name, - max_model_len=1024, - guided_decoding_backend=guided_decoding_backend) + # + # Test 8: Generate text based on a choices + # sampling_params = SamplingParams( temperature=0.8, top_p=0.95, @@ -353,33 +295,9 @@ def test_guided_choice_completion( assert generated_text in sample_guided_choice print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -class CarType(str, Enum): - sedan = "sedan" - suv = "SUV" - truck = "Truck" - coupe = "Coupe" - - -class CarDescription(BaseModel): - brand: str - model: str - car_type: CarType - - 
-@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", - GUIDED_DECODING_BACKENDS_V1) -@pytest.mark.parametrize("model_name", MODELS_TO_TEST) -def test_guided_json_completion_with_enum( - monkeypatch: pytest.MonkeyPatch, - guided_decoding_backend: str, - model_name: str, -): - monkeypatch.setenv("VLLM_USE_V1", "1") - llm = LLM(model=model_name, - max_model_len=1024, - guided_decoding_backend=guided_decoding_backend) + # + # Test 9: Generate structured output using a Pydantic model with an enum + # json_schema = CarDescription.model_json_schema() sampling_params = SamplingParams( temperature=1.0, @@ -403,3 +321,41 @@ def test_guided_json_completion_with_enum( print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") output_json = json.loads(generated_text) jsonschema.validate(instance=output_json, schema=json_schema) + + +@pytest.mark.skip_global_cleanup +@pytest.mark.parametrize("model_name", MODELS_TO_TEST) +def test_structured_output_auto_mode( + monkeypatch: pytest.MonkeyPatch, + unsupported_json_schema: dict[str, Any], + model_name: str, +): + monkeypatch.setenv("VLLM_USE_V1", "1") + + llm = LLM(model=model_name, + max_model_len=1024, + guided_decoding_backend="auto") + + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams(json=unsupported_json_schema)) + + # This would fail with the default of "xgrammar", but in "auto" + # we will handle fallback automatically. 
+ outputs = llm.generate(prompts=("Give an example JSON object for a grade " + "that fits this schema: " + f"{unsupported_json_schema}"), + sampling_params=sampling_params, + use_tqdm=True) + assert outputs is not None + for output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + generated_text = output.outputs[0].text + assert generated_text is not None + print(generated_text) + + # Parse to verify it is valid JSON + parsed_json = json.loads(generated_text) + assert isinstance(parsed_json, dict) From 8427f70493ed67bf26cb9e7fa98ac202b991c37d Mon Sep 17 00:00:00 2001 From: cyyever Date: Sat, 29 Mar 2025 12:11:51 +0800 Subject: [PATCH 096/593] Use numba 0.61 for python 3.10+ to support numpy>=2 (#15692) Signed-off-by: cyy --- requirements/common.txt | 2 +- requirements/cuda.txt | 3 ++- requirements/rocm.txt | 3 ++- requirements/test.in | 4 +++- requirements/test.txt | 8 +++++--- 5 files changed, 13 insertions(+), 7 deletions(-) diff --git a/requirements/common.txt b/requirements/common.txt index 14084b79121bb..dfa20f5e3f08e 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -1,7 +1,7 @@ cachetools psutil sentencepiece # Required for LLaMA tokenizer. -numpy < 2.0.0 +numpy requests >= 2.26.0 tqdm blake3 diff --git a/requirements/cuda.txt b/requirements/cuda.txt index ad7198081e0fa..9be7a868f56e0 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -1,7 +1,8 @@ # Common dependencies -r common.txt -numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding +numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding +numba == 0.61; python_version > '3.9' # Dependencies for NVIDIA GPUs ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1. 
diff --git a/requirements/rocm.txt b/requirements/rocm.txt index 345c84b0f6cf2..5d5fea2d0e57e 100644 --- a/requirements/rocm.txt +++ b/requirements/rocm.txt @@ -1,7 +1,8 @@ # Common dependencies -r common.txt -numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding +numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding +numba == 0.61; python_version > '3.9' # Dependencies for AMD GPUs awscli diff --git a/requirements/test.in b/requirements/test.in index 3df5e32cd59e1..a7dd54151dee8 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -38,7 +38,9 @@ buildkite-test-collector==0.1.9 genai_perf==0.0.8 tritonclient==2.51.0 -numpy < 2.0.0 +numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding +numba == 0.61; python_version > '3.9' +numpy runai-model-streamer==0.11.0 runai-model-streamer-s3==0.11.0 fastsafetensors>=0.1.10 diff --git a/requirements/test.txt b/requirements/test.txt index b0ae479604a1e..aed6a5653e2ad 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -219,7 +219,7 @@ libnacl==2.1.0 # via tensorizer librosa==0.10.2.post1 # via -r requirements/test.in -llvmlite==0.43.0 +llvmlite==0.44.0 # via numba lm-eval==0.4.4 # via -r requirements/test.in @@ -262,8 +262,10 @@ networkx==3.2.1 # via torch nltk==3.9.1 # via rouge-score -numba==0.60.0 - # via librosa +numba==0.61.0 + # via + # -r requirements/test.in + # librosa numexpr==2.10.1 # via lm-eval numpy==1.26.4 From 5b800f0932c0d6661cb3aa85a62d89265197a034 Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Sat, 29 Mar 2025 12:12:26 +0800 Subject: [PATCH 097/593] [Bugfix] set VLLM_WORKER_MULTIPROC_METHOD=spawn for vllm.entrypoionts.openai.api_server (#15700) Signed-off-by: Jinzhen Lin --- vllm/entrypoints/cli/main.py | 28 ++------------------------- vllm/entrypoints/openai/api_server.py | 4 +++- vllm/entrypoints/utils.py | 26 
+++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 27 deletions(-) diff --git a/vllm/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py index 13f2761b0db06..aa54bd66bed67 100644 --- a/vllm/entrypoints/cli/main.py +++ b/vllm/entrypoints/cli/main.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # The CLI entrypoint to vLLM. -import os import signal import sys @@ -9,11 +8,9 @@ import vllm.entrypoints.cli.benchmark.main import vllm.entrypoints.cli.openai import vllm.entrypoints.cli.serve import vllm.version -from vllm.logger import init_logger +from vllm.entrypoints.utils import cli_env_setup from vllm.utils import FlexibleArgumentParser -logger = init_logger(__name__) - CMD_MODULES = [ vllm.entrypoints.cli.openai, vllm.entrypoints.cli.serve, @@ -30,29 +27,8 @@ def register_signal_handlers(): signal.signal(signal.SIGTSTP, signal_handler) -def env_setup(): - # The safest multiprocessing method is `spawn`, as the default `fork` method - # is not compatible with some accelerators. The default method will be - # changing in future versions of Python, so we should use it explicitly when - # possible. - # - # We only set it here in the CLI entrypoint, because changing to `spawn` - # could break some existing code using vLLM as a library. `spawn` will cause - # unexpected behavior if the code is not protected by - # `if __name__ == "__main__":`. 
- # - # References: - # - https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods - # - https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing - # - https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors - # - https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders - if "VLLM_WORKER_MULTIPROC_METHOD" not in os.environ: - logger.debug("Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'") - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - - def main(): - env_setup() + cli_env_setup() parser = FlexibleArgumentParser(description="vLLM CLI") parser.add_argument('-v', diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 18d75a04ab0f3..2a61259896a37 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -82,7 +82,8 @@ from vllm.entrypoints.openai.serving_tokenization import ( from vllm.entrypoints.openai.serving_transcription import ( OpenAIServingTranscription) from vllm.entrypoints.openai.tool_parsers import ToolParserManager -from vllm.entrypoints.utils import load_aware_call, with_cancellation +from vllm.entrypoints.utils import (cli_env_setup, load_aware_call, + with_cancellation) from vllm.logger import init_logger from vllm.reasoning import ReasoningParserManager from vllm.transformers_utils.config import ( @@ -1106,6 +1107,7 @@ if __name__ == "__main__": # NOTE(simon): # This section should be in sync with vllm/entrypoints/cli/main.py for CLI # entrypoints. 
+ cli_env_setup() parser = FlexibleArgumentParser( description="vLLM OpenAI-Compatible RESTful API server.") parser = make_arg_parser(parser) diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index 773f52fa38f88..b88c2b3a080fd 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -2,11 +2,16 @@ import asyncio import functools +import os from fastapi import Request from fastapi.responses import JSONResponse, StreamingResponse from starlette.background import BackgroundTask, BackgroundTasks +from vllm.logger import init_logger + +logger = init_logger(__name__) + async def listen_for_disconnect(request: Request) -> None: """Returns if a disconnect message is received""" @@ -108,3 +113,24 @@ def load_aware_call(func): return response return wrapper + + +def cli_env_setup(): + # The safest multiprocessing method is `spawn`, as the default `fork` method + # is not compatible with some accelerators. The default method will be + # changing in future versions of Python, so we should use it explicitly when + # possible. + # + # We only set it here in the CLI entrypoint, because changing to `spawn` + # could break some existing code using vLLM as a library. `spawn` will cause + # unexpected behavior if the code is not protected by + # `if __name__ == "__main__":`. 
+ # + # References: + # - https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods + # - https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing + # - https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors + # - https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders + if "VLLM_WORKER_MULTIPROC_METHOD" not in os.environ: + logger.debug("Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'") + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" From da461f3cbf8be4094a6f14a1eaf89b5931f3625f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Sat, 29 Mar 2025 05:13:06 +0100 Subject: [PATCH 098/593] [TPU][V1][Bugfix] Fix w8a8 recompiilation with GSM8K (#15714) Signed-off-by: NickLucche --- .buildkite/run-tpu-v1-test.sh | 10 ++++------ .../layers/quantization/kernels/scaled_mm/xla.py | 3 ++- vllm/v1/worker/tpu_model_runner.py | 14 ++++++++------ vllm/v1/worker/tpu_worker.py | 4 ++-- 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/.buildkite/run-tpu-v1-test.sh b/.buildkite/run-tpu-v1-test.sh index 2c356b8fe5274..89252000f4003 100755 --- a/.buildkite/run-tpu-v1-test.sh +++ b/.buildkite/run-tpu-v1-test.sh @@ -28,16 +28,14 @@ docker run --privileged --net host --shm-size=16G -it \ && echo TEST_3 \ && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \ && echo TEST_4 \ - && python3 /workspace/vllm/examples/offline_inference/tpu.py \ + && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ && echo TEST_5 \ - && pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \ + && python3 /workspace/vllm/examples/offline_inference/tpu.py \ && echo TEST_6 \ + && pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \ + && echo TEST_7 \ && pytest -s -v 
/workspace/vllm/tests/v1/tpu/test_sampler.py" \ # TODO: This test fails because it uses RANDOM_SEED sampling # && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ - -# TODO: Re-enable this after fixing recompilation in quantization. -# && echo TEST_4 \ -# && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py index 0bf090d7fab3c..089314071d39e 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py @@ -97,7 +97,8 @@ class XLAScaledMMLinearKernel(ScaledMMLinearKernel): block_size=-1, int4_weight=False, quantize_activation=True) - + # `quantized_matmul` output is fp32, cast it down to bf16 for perf + out = out.to(x.dtype) # Explicitly capture control flow to make dynamo happy. # https://pytorch.org/docs/main/generated/exportdb/index.html#cond-branch-class-method # noqa: E501 return cond(bias is None, self.no_add_bias, self.add_bias, [out, bias]) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 695e31f715b4d..773cd971103ae 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -80,6 +80,7 @@ class TPUModelRunner: self.enforce_eager = model_config.enforce_eager self.pin_memory = is_pin_memory_available() self.dtype = self.model_config.dtype + self._hidden_states_dtype = self.dtype self.is_multimodal_model = model_config.is_multimodal_model self.sliding_window = model_config.get_sliding_window() @@ -771,10 +772,11 @@ class TPUModelRunner: torch._dynamo.mark_dynamic(attn_metadata.slot_mapping, 0) with set_forward_context(attn_metadata, self.vllm_config, 0): - self.model(input_ids=input_ids, - positions=position_ids, - kv_caches=kv_caches, - inputs_embeds=inputs_embeds) + out = self.model(input_ids=input_ids, + 
positions=position_ids, + kv_caches=kv_caches, + inputs_embeds=inputs_embeds) + self._hidden_states_dtype = out.dtype def capture_model(self) -> None: """Compile the model.""" @@ -800,7 +802,7 @@ class TPUModelRunner: num_reqs_to_sample = MIN_NUM_SEQS dummy_hidden = torch.randn((num_tokens, hsize), device=device, - dtype=torch.bfloat16) + dtype=self._hidden_states_dtype) # Compile for [8, 16, .., 128,.., `self.max_num_reqs`] while True: indices = torch.zeros( @@ -823,7 +825,7 @@ class TPUModelRunner: num_reqs_to_sample + 1, self.max_num_reqs) xm.wait_device_ops() end = time.perf_counter() - logger.info("Compilation finished in in %.2f [secs].", end - start) + logger.info("Compilation finished in %.2f [secs].", end - start) # Record the number cached XLA graph after warming up, this will be # used for checking there is no additional graph compilation during # runtime execution. diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index c8691ee87fe6a..b51bd20f6f118 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -105,8 +105,8 @@ class TPUWorker: # Increase the cache size limit, which is the maximum number of # dynamo graphs that can be compiled. - # NOTE(woosuk): Usually, we compile 10-15 graphs for prefill and - # 30-40 graphs for decode. 128 is an arbitrary safe number. + # TODO (NickLucche) On gsm we compile 80+ graphs. + # Re-evaluate limit, with MM we may get close to this limit. torch._dynamo.config.cache_size_limit = 128 # Use persistent cache to avoid XLA recompilation. 
# NOTE(woosuk): Set per-rank cache path since different ranks From 7c1f7600248a0a0497a5c512ef0ee262577c5f7a Mon Sep 17 00:00:00 2001 From: yarongmu-google <150371854+yarongmu-google@users.noreply.github.com> Date: Fri, 28 Mar 2025 21:13:15 -0700 Subject: [PATCH 099/593] [Kernel][TPU][ragged-paged-attn] vLLM code change for PR#8896 (#15659) Signed-off-by: Yarong Mu --- requirements/tpu.txt | 12 ++++---- vllm/v1/attention/backends/pallas.py | 43 ++++++++++++++-------------- vllm/v1/worker/tpu_model_runner.py | 11 ++++--- vllm/v1/worker/tpu_worker.py | 8 +++--- 4 files changed, 37 insertions(+), 37 deletions(-) diff --git a/requirements/tpu.txt b/requirements/tpu.txt index 35d5db6c46006..1930eacb61ad6 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -17,9 +17,9 @@ ray[data] --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; 
python_version == "3.11" +torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250328-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250328-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250328-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250328-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250328-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250328-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py index 14d3664db0d64..2f86920e2773a 100644 --- a/vllm/v1/attention/backends/pallas.py +++ b/vllm/v1/attention/backends/pallas.py @@ -41,7 +41,7 @@ class PallasAttentionBackend(AttentionBackend): num_kv_heads: int, head_size: int, ) -> tuple[int, ...]: - return (num_blocks, block_size, num_kv_heads * head_size) + return (num_blocks, block_size, num_kv_heads * 2, head_size) @staticmethod def swap_blocks( @@ -132,7 +132,7 @@ class PallasAttentionBackendImpl(AttentionImpl): query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - kv_cache: tuple[torch.Tensor, torch.Tensor], + kv_cache: torch.Tensor, attn_metadata: PallasMetadata, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -142,14 +142,13 @@ class PallasAttentionBackendImpl(AttentionImpl): query: shape = [num_tokens, num_heads * head_size] key: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, 
num_kv_heads * head_size] - kv_cache = ([num_blocks, block_size, num_kv_heads * head_size], - [num_blocks, block_size, num_kv_heads * head_size]) + kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] """ # For determine_available_memory case. - if kv_cache[0].numel() == 0: + if kv_cache.numel() == 0: if output is None: output = torch.ones_like(query) return output @@ -158,15 +157,13 @@ class PallasAttentionBackendImpl(AttentionImpl): num_tokens, hidden_size = query.shape query = query.view(num_tokens, self.num_heads, self.head_size) - key_cache, value_cache = kv_cache - if kv_cache[0].numel() > 0: + if kv_cache.numel() > 0: slot_mapping = attn_metadata.slot_mapping - write_to_kv_cache(key, value, key_cache, value_cache, slot_mapping) + write_to_kv_cache(key, value, kv_cache, slot_mapping) output = torch.ops.xla.ragged_paged_attention( query, - key_cache, - value_cache, + kv_cache, attn_metadata.context_lens, attn_metadata.block_tables, attn_metadata.query_start_loc, @@ -183,23 +180,27 @@ class PallasAttentionBackendImpl(AttentionImpl): def write_to_kv_cache( key: torch.Tensor, value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, + kv_cache: torch.Tensor, slot_mapping: torch.Tensor, ) -> None: """ Write the key and values to the KV cache. 
Args: key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] - k_cache = [num_blocks, block_size, num_kv_heads * head_size] - v_cache = [num_blocks, block_size, num_kv_heads * head_size] + value: shape = [num_tokens, num_kv_heads * head_size] + kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size] """ - torch.ops.xla.dynamo_set_buffer_donor_(key_cache, True) - torch.ops.xla.dynamo_set_buffer_donor_(value_cache, True) + _, _, num_combined_kv_heads, head_size = kv_cache.shape + num_kv_heads = num_combined_kv_heads // 2 - key_cache = key_cache.flatten(0, 1) - value_cache = value_cache.flatten(0, 1) - key_cache.index_copy_(0, slot_mapping, key) - value_cache.index_copy_(0, slot_mapping, value) + key = key.view(-1, num_kv_heads, head_size) + value = value.view(-1, num_kv_heads, head_size) + + kv = torch.cat([key, value], axis=-1).reshape(-1, num_combined_kv_heads, + head_size) + + torch.ops.xla.dynamo_set_buffer_donor_(kv_cache, True) + + kv_cache = kv_cache.flatten(0, 1) + kv_cache.index_copy_(0, slot_mapping, kv) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 773cd971103ae..ea5a17016eb6b 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -861,12 +861,11 @@ class TPUModelRunner: kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) dtype = kv_cache_spec.dtype - tpu_k_cache = torch.zeros(kv_cache_shape, - dtype=dtype, - device=self.device) - tpu_v_cache = torch.zeros_like(tpu_k_cache) + tpu_kv_cache = torch.zeros(kv_cache_shape, + dtype=dtype, + device=self.device) - kv_caches[layer_name] = (tpu_k_cache, tpu_v_cache) + kv_caches[layer_name] = tpu_kv_cache else: raise NotImplementedError @@ -893,7 +892,7 @@ class ModelWrapperV1(nn.Module): self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: list[tuple[torch.Tensor, torch.Tensor]], + kv_caches: list[torch.Tensor], inputs_embeds: Optional[torch.Tensor] = None, 
) -> torch.Tensor: """Executes the forward pass of the model. diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index b51bd20f6f118..9add8cee02e5b 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -136,10 +136,10 @@ class TPUWorker: # Use an empty tensor instead of `None`` to force Dynamo to pass # it by reference, rather by specializing on the value ``None``. - tpu_k_cache = torch.tensor([], dtype=dtype, device=self.device) - tpu_v_cache = torch.tensor([], dtype=dtype, device=self.device) - - kv_caches[layer_name] = (tpu_k_cache, tpu_v_cache) + tpu_kv_cache = torch.tensor([], + dtype=dtype, + device=self.device) + kv_caches[layer_name] = tpu_kv_cache else: raise NotImplementedError From 73aa7041bfee43581314e6f34e9a657137ecc092 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Sat, 29 Mar 2025 12:27:22 +0800 Subject: [PATCH 100/593] [doc] update doc (#15740) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- docs/README.md | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/docs/README.md b/docs/README.md index 74e05ce02636b..dcd5e759dfa88 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,19 +2,42 @@ ## Build the docs -```bash -# Install dependencies. -pip install -r ../requirements/docs.txt +- Make sure in `docs` directory -# Build the docs. +```bash +cd docs +``` + +- Install the dependencies: + +```bash +pip install -r ../requirements/docs.txt +``` + +- Clean the previous build (optional but recommended): + +```bash make clean +``` + +- Generate the HTML documentation: + +```bash make html ``` ## Open the docs with your browser +- Serve the documentation locally: + ```bash python -m http.server -d build/html/ ``` -Launch your browser and open localhost:8000. +This will start a local server at http://localhost:8000. You can now open your browser and view the documentation. 
+ +If port 8000 is already in use, you can specify a different port, for example: + +```bash +python -m http.server 3000 -d build/html/ +``` From 4965ec42d28830f0c30756dea19e14b45cdbe5b1 Mon Sep 17 00:00:00 2001 From: TJian Date: Sat, 29 Mar 2025 18:33:56 +0800 Subject: [PATCH 101/593] [FEAT] [ROCm] Add AITER int8 scaled gemm kernel (#15433) Signed-off-by: tjtanaa --- tests/quantization/test_compressed_tensors.py | 76 ++++++++++- vllm/envs.py | 8 ++ .../kernels/scaled_mm/__init__.py | 4 +- .../quantization/kernels/scaled_mm/aiter.py | 119 ++++++++++++++++++ 4 files changed, 202 insertions(+), 5 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 133475a3e06aa..5c928f27c10dd 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -20,6 +20,23 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( sparse_cutlass_supported) from vllm.platforms import current_platform +# AITER only supports per-channel-per-channel INT8 gemm +# and per-tensor-per-tensor INT8 GEMM. +# It does not support mix precision MM and mix quantization scheme. +ROCM_AITER_SUPPORTED_INT8_MODEL = [ + "neuralmagic/Llama-3.2-1B-quantized.w8a8", + "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2" +] + +# TritonScaledMMLinearKernel only supports symmetric quantization. 
+ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL = [ + "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", + "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", + "neuralmagic/Llama-3.2-1B-quantized.w8a8", + "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", + "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", +] + @pytest.fixture(scope="function", autouse=True) def use_v0_only(monkeypatch): @@ -57,6 +74,11 @@ def use_v0_only(monkeypatch): ) def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): model_path, strategy, quant_type, shape_0, is_symmetric = model_args + + if current_platform.is_rocm( + ) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL: + pytest.skip(f"Skip model {model_path} as it is not support on ROCm.") + with vllm_runner(model_path, enforce_eager=True) as llm: def check_model(model): @@ -123,6 +145,8 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): ) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [10]) +@pytest.mark.parametrize( + "use_aiter", [True, False] if current_platform.is_rocm() else [False]) def test_compressed_tensors_w8a8_logprobs( hf_runner, vllm_runner, @@ -130,7 +154,21 @@ def test_compressed_tensors_w8a8_logprobs( model_path, max_tokens, num_logprobs, + use_aiter, + monkeypatch, ): + + if current_platform.is_rocm( + ) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL: + pytest.skip(f"Skip model {model_path} as it is not support on ROCm.") + + if use_aiter: + if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL: + pytest.skip( + f"Skip model {model_path} as it is not support by aiter.") + # this will enable VLLM_ROCM_USE_AITER_LINEAR + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + dtype = "bfloat16" # skip language translation prompt for the static per tensor asym model @@ -154,6 +192,9 @@ def test_compressed_tensors_w8a8_logprobs( name_1="vllm", ) + if current_platform.is_rocm(): + 
torch.cuda.synchronize() + def test_compressed_tensors_no_enforce_eager(vllm_runner): model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change" @@ -177,8 +218,27 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner): ), ], ) -def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): +@pytest.mark.parametrize( + "use_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_compressed_tensors_w8a8_dynamic_per_token( + vllm_runner, + model_args, + use_aiter, + monkeypatch, +): model_path, strategy = model_args + + if current_platform.is_rocm( + ) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL: + pytest.skip(f"Skip model {model_path} as it is not support on ROCm.") + + if use_aiter: + if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL: + pytest.skip( + f"Skip model {model_path} as it is not support by aiter.") + # this will enable VLLM_ROCM_USE_AITER_LINEAR + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + with vllm_runner(model_path, dtype=torch.float16) as llm: def check_model(model): @@ -207,6 +267,8 @@ def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4), ], ) +@pytest.mark.skipif(not current_platform.is_cuda(), + reason="The tests are skipped on non-CUDA platform.") def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): model, strategy, group, pack_factor = wNa16_args with vllm_runner(model) as llm: @@ -231,6 +293,8 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): assert output +@pytest.mark.skipif(not current_platform.is_cuda(), + reason="This test is skipped on non-CUDA platform.") def test_compressed_tensors_w4a16_marlin24(vllm_runner): model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t" with vllm_runner(model_path) as llm: @@ -271,7 +335,7 @@ def test_compressed_tensors_fp8(vllm_runner): if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8): assert 
len(qkv_proj.input_scale.shape) == 0 - assert qkv_proj.weight.dtype is torch.float8_e4m3fn + assert qkv_proj.weight.dtype is current_platform.fp8_dtype() assert qkv_proj.weight_scale.dtype is torch.float32 assert len(qkv_proj.weight_scale.shape) == 0 @@ -281,6 +345,8 @@ def test_compressed_tensors_fp8(vllm_runner): assert output +@pytest.mark.skipif(not current_platform.is_cuda(), + reason="This test is skipped on non-CUDA platform.") def test_compressed_tensors_kv_cache(vllm_runner): model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme" with vllm_runner(model_path, kv_cache_dtype="fp8") as llm: @@ -309,7 +375,8 @@ def _test_2of4_quant_models(qkv_proj, @pytest.mark.skipif( - not current_platform.has_device_capability(90), + not current_platform.is_cuda() + or not current_platform.has_device_capability(90), reason="Sparse FP8 is not yet supported on this GPU type.", ) @pytest.mark.parametrize( @@ -356,7 +423,8 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4): @pytest.mark.skipif( - not current_platform.has_device_capability(90), + not current_platform.is_cuda() + or not current_platform.has_device_capability(90), reason="Sparse FP8 is not yet supported on this GPU type.", ) @pytest.mark.parametrize( diff --git a/vllm/envs.py b/vllm/envs.py index 5334667376b24..8a03ba329b028 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -75,6 +75,7 @@ if TYPE_CHECKING: VLLM_DISABLED_KERNELS: list[str] = [] VLLM_USE_V1: bool = True VLLM_ROCM_USE_AITER: bool = False + VLLM_ROCM_USE_AITER_LINEAR: bool = True VLLM_ROCM_USE_AITER_MOE: bool = True VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE: bool = False VLLM_ROCM_USE_AITER_RMSNORM: bool = True @@ -524,6 +525,13 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1")), + # use aiter linear op if aiter ops are enabled + # The following list of related ops + # - scaled_mm (per-tensor / rowwise) + 
"VLLM_ROCM_USE_AITER_LINEAR": + lambda: (os.getenv("VLLM_ROCM_USE_AITER_LINEAR", "True").lower() in + ("true", "1")), + # Whether to use aiter moe ops. # By default is enabled. "VLLM_ROCM_USE_AITER_MOE": diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index a5967995ac88d..bedda4c2ab21b 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -3,6 +3,8 @@ import os from typing import Dict, List, Optional, Type +from vllm.model_executor.layers.quantization.kernels.scaled_mm.aiter import ( + AiterScaledMMLinearKernel) from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import ( CutlassScaledMMLinearKernel) from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501 @@ -17,7 +19,7 @@ from vllm.platforms import PlatformEnum, current_platform _POSSIBLE_KERNELS: Dict[PlatformEnum, List[Type[ScaledMMLinearKernel]]] = { PlatformEnum.CPU: [CutlassScaledMMLinearKernel], PlatformEnum.CUDA: [CutlassScaledMMLinearKernel], - PlatformEnum.ROCM: [TritonScaledMMLinearKernel], + PlatformEnum.ROCM: [AiterScaledMMLinearKernel, TritonScaledMMLinearKernel], PlatformEnum.TPU: [XLAScaledMMLinearKernel], } diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py new file mode 100644 index 0000000000000..582b12f76562c --- /dev/null +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py @@ -0,0 +1,119 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional, Tuple + +import torch + +import vllm.envs as envs +from vllm import _custom_ops as ops +from vllm.platforms import current_platform + +from .cutlass import CutlassScaledMMLinearKernel +from .ScaledMMLinearKernel import 
ScaledMMLinearLayerConfig + + +class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel): + + @classmethod + def get_min_capability(cls) -> int: + return 90 + + @classmethod + def can_implement( + cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]: + if not current_platform.is_rocm(): + return ( + False, + "AiterScaledMMLinearKernel requires `aiter` which is not " + + "currently supported on non-ROCm platform.") + + try: + import aiter # noqa: F401 # deliberately attempt to import aiter + except Exception: + return ( + False, + "AiterScaledMMLinearKernel requires `aiter` which is not " + + "installed on ROCm.") + # Check if rocm_aiter_gemm_w8a8_scaled_mm is enabled + if not ( + envs.VLLM_ROCM_USE_AITER_LINEAR \ + and envs.VLLM_ROCM_USE_AITER + ): + return (False, "AiterScaledMMLinearKernel is disabled. " + + "Enable by setting `VLLM_ROCM_USE_AITER=1` " + + "and `VLLM_ROCM_USE_AITER_LINEAR=1`. " + + "`VLLM_ROCM_USE_AITER_LINEAR` default is True.") + + if not c.input_symmetric: + return (False, + "AiterScaledMMLinearKernel only supports symmetric " + + "quantization.") + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + super().process_weights_after_loading(layer) + + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + `AiterScaledMMLinearKernel` implements a fused version of + `output = torch.mm((scale_a * a), (scale_b * b)).to(out_dtype)` + where scale_a * a and scale_b * b are implemented using numpy-style + broadcasting. + Currently only support per-tensor-per-tensor GEMM + and per-token-per-channel GEMM through AITER + w8a8 scaled gemm. `AiterScaledMMLinearKernel` also does not support + ATIER block scaled GEMM and mix-precision GEMM. 
+ """ + w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer) + + # ops.scaled_int8_quant supports both dynamic and static quant: + # * dynamic, i_s is None and x_s computed from x. + # * static, i_s is scalar and x_s is i_s. + symmetric = azp_adj is None + assert symmetric, ("AiterScaledMMLinearKernel only supports" + " symmetric quantization.") + x_q, x_s, x_zp = ops.scaled_int8_quant(x, + i_s, + i_zp, + symmetric=symmetric) + + assert x_zp is None, ("AiterScaledMMLinearKernel only supports" + " symmetric quantization.") + out_dtype = x.dtype + + assert (w_q.shape[0] % 16 == 0 and w_q.shape[1] % 16 == 0) + assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16) + assert bias is None or bias.shape[0] == w_q.shape[ + 1] and bias.dtype == out_dtype + + m = x_q.shape[0] # a + n = w_q.shape[1] # b + + per_tensor_scale_a = (x_s.numel() == 1) + per_tensor_scale_b = (w_s.numel() == 1) + per_token_scale_a = (x_s.numel() == m) + per_channel_scale_b = (w_s.numel() == n) + + # @TODO: + # Maybe broadcast the per-tensor-scale into per-channel-scale + # if one of the scale is a per-channel-scale. + # For now, it only supports: + # - per-tensor-per-tensor a8w8 scaled GEMM, and + # - per-token-per-channel a8w8 scaled GEMM + assert ((per_tensor_scale_a and per_tensor_scale_b) + or (per_token_scale_a and per_channel_scale_b)), ( + "Currently only support per-tensor-per-tensor GEMM " + + " and per-token-per-channel GEMM through AITER" + " w8a8 scaled gemm. 
`AiterScaledMMLinearKernel` " + + "does not support AITER block scaled GEMM.") + + from aiter import gemm_a8w8_CK + + # gemm_a8w8_CK(a, b, scale_a, scale_b, bias) expects + # a to be [M, K] + # b to be [N, K] + # CutlassScaledMMLinearKernel prepare weight `w_q` in [K, N] format + return gemm_a8w8_CK(x_q, w_q.t(), x_s, w_s, bias).to(out_dtype) From 94744ba41a2807cb195e4a41a85d4d49f6867967 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Sat, 29 Mar 2025 05:39:14 -0500 Subject: [PATCH 102/593] [V1] [Feature] Collective RPC (#15444) Signed-off-by: wwl2755 --- .buildkite/test-pipeline.yaml | 6 ++--- vllm/engine/llm_engine.py | 13 +++++++++-- vllm/entrypoints/llm.py | 4 ++-- vllm/v1/engine/core.py | 12 +++++++++- vllm/v1/engine/core_client.py | 43 ++++++++++++++++++++++++++++++++++- vllm/v1/engine/llm_engine.py | 10 +++++++- vllm/v1/serial_utils.py | 8 +++++++ 7 files changed, 86 insertions(+), 10 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 428b4c593c38e..62872bf8e3e18 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -150,8 +150,8 @@ steps: # TODO: create a dedicated test section for multi-GPU example tests # when we have multiple distributed example tests - pushd ../examples/offline_inference - - VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py - - VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - python3 rlhf.py + - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - popd - label: Metrics, Tracing Test # 10min @@ -520,7 +520,7 @@ steps: - vllm/v1/engine/ commands: - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - - VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' diff 
--git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 5682b3dabe2e8..10677878ecc8f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -7,8 +7,8 @@ from collections import deque from contextlib import contextmanager from dataclasses import dataclass from functools import partial -from typing import (TYPE_CHECKING, Callable, ClassVar, Deque, Dict, Iterable, - List, Mapping, NamedTuple, Optional) +from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict, + Iterable, List, Mapping, NamedTuple, Optional) from typing import Sequence as GenericSequence from typing import Set, Type, Union, cast, overload @@ -67,6 +67,7 @@ _LOCAL_LOGGING_INTERVAL_SEC = 5 _G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup) _O = TypeVar("_O", RequestOutput, PoolingRequestOutput) +_R = TypeVar("_R", default=Any) @dataclass @@ -2123,6 +2124,14 @@ class LLMEngine: return sampling_params + def collective_rpc(self, + method: Union[str, Callable[..., _R]], + timeout: Optional[float] = None, + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None) -> list[_R]: + return self.model_executor.collective_rpc(method, timeout, args, + kwargs) + if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 1887caf25a30f..7c354be2d45c5 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -492,8 +492,8 @@ class LLM: It is recommended to use this API to only pass control messages, and set up data-plane communication to pass data. 
""" - executor = self.llm_engine.model_executor - return executor.collective_rpc(method, timeout, args, kwargs) + + return self.llm_engine.collective_rpc(method, timeout, args, kwargs) def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: """ diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 20904cd495f91..6083eea45cd98 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -8,7 +8,7 @@ import time from concurrent.futures import Future from inspect import isclass, signature from logging import DEBUG -from typing import Any, Optional +from typing import Any, Callable, Optional, TypeVar, Union import msgspec import psutil @@ -43,6 +43,8 @@ logger = init_logger(__name__) POLLING_TIMEOUT_S = 2.5 +_R = TypeVar('_R') # Return type for collective_rpc + class EngineCore: """Inner loop of vLLM's Engine.""" @@ -280,6 +282,14 @@ class EngineCore: def pin_lora(self, lora_id: int) -> bool: return self.model_executor.pin_lora(lora_id) + def collective_rpc(self, + method: Union[str, Callable[..., _R]], + timeout: Optional[float] = None, + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None) -> list[_R]: + return self.model_executor.collective_rpc(method, timeout, args, + kwargs) + class EngineCoreProc(EngineCore): """ZMQ-wrapper for running EngineCore in background process.""" diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 8858a564d2c2b..3dc33a1284a12 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -12,7 +12,7 @@ from collections.abc import Awaitable, Sequence from concurrent.futures import Future from dataclasses import dataclass, field from threading import Thread -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Optional, TypeVar, Union import zmq import zmq.asyncio @@ -33,6 +33,8 @@ logger = init_logger(__name__) AnyFuture = Union[asyncio.Future[Any], Future[Any]] +_R = TypeVar('_R') # Return type for collective_rpc + 
class EngineCoreClient(ABC): """ @@ -117,6 +119,13 @@ class EngineCoreClient(ABC): def pin_lora(self, lora_id: int) -> bool: raise NotImplementedError + def collective_rpc(self, + method: Union[str, Callable[..., _R]], + timeout: Optional[float] = None, + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None) -> list[_R]: + raise NotImplementedError + async def get_output_async(self) -> EngineCoreOutputs: raise NotImplementedError @@ -153,6 +162,14 @@ class EngineCoreClient(ABC): async def pin_lora_async(self, lora_id: int) -> bool: raise NotImplementedError + async def collective_rpc_async( + self, + method: Union[str, Callable[..., _R]], + timeout: Optional[float] = None, + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None) -> list[_R]: + raise NotImplementedError + class InprocClient(EngineCoreClient): """ @@ -210,6 +227,13 @@ class InprocClient(EngineCoreClient): def pin_lora(self, lora_id: int) -> bool: return self.engine_core.pin_lora(lora_id) + def collective_rpc(self, + method: Union[str, Callable[..., _R]], + timeout: Optional[float] = None, + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None) -> list[_R]: + return self.engine_core.collective_rpc(method, timeout, args, kwargs) + class CoreEngine: """One per data parallel rank.""" @@ -505,6 +529,14 @@ class SyncMPClient(MPClient): def execute_dummy_batch(self) -> None: self.call_utility("execute_dummy_batch") + def collective_rpc(self, + method: Union[str, Callable[..., _R]], + timeout: Optional[float] = None, + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None) -> list[_R]: + return self.call_utility("collective_rpc", method, timeout, args, + kwargs) + class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" @@ -636,6 +668,15 @@ class AsyncMPClient(MPClient): async def pin_lora_async(self, lora_id: int) -> bool: return await self.call_utility_async("pin_lora", lora_id) + async def collective_rpc_async( + self, + method: Union[str, 
Callable[..., _R]], + timeout: Optional[float] = None, + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None) -> list[_R]: + return await self.call_utility_async("collective_rpc", method, timeout, + args, kwargs) + class DPAsyncMPClient(AsyncMPClient): """Asyncio-compatible client for multi-proc, multi-engine (data parallel) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 000de21fbe7bf..764c643b5c974 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -2,7 +2,7 @@ from collections.abc import Mapping from copy import copy -from typing import Optional, Union +from typing import Any, Callable, Optional, Union from typing_extensions import TypeVar @@ -32,6 +32,7 @@ from vllm.v1.executor.abstract import Executor logger = init_logger(__name__) _G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup) +_R = TypeVar("_R", default=Any) class LLMEngine: @@ -282,6 +283,13 @@ class LLMEngine: """Prevent an adapter from being evicted.""" return self.engine_core.pin_lora(lora_id) + def collective_rpc(self, + method: Union[str, Callable[..., _R]], + timeout: Optional[float] = None, + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None) -> list[_R]: + return self.engine_core.collective_rpc(method, timeout, args, kwargs) + def __del__(self): if dp_group := getattr(self, "dp_group", None): stateless_destroy_torch_distributed_process_group(dp_group) diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 3f000abcde0d1..146d7d747f1a4 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -1,13 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 import pickle +from types import FunctionType from typing import Any, Optional +import cloudpickle import torch from msgspec import msgpack CUSTOM_TYPE_TENSOR = 1 CUSTOM_TYPE_PICKLE = 2 +CUSTOM_TYPE_CLOUDPICKLE = 3 class MsgpackEncoder: @@ -41,6 +44,9 @@ def custom_enc_hook(obj: Any) -> Any: # 
https://gist.github.com/tlrmchlsmth/8067f1b24a82b6e2f90450e7764fa103 # noqa: E501 return msgpack.Ext(CUSTOM_TYPE_TENSOR, pickle.dumps(obj.numpy())) + if isinstance(obj, FunctionType): + return msgpack.Ext(CUSTOM_TYPE_CLOUDPICKLE, cloudpickle.dumps(obj)) + return msgpack.Ext(CUSTOM_TYPE_PICKLE, pickle.dumps(obj)) @@ -49,5 +55,7 @@ def custom_ext_hook(code: int, data: memoryview) -> Any: return torch.from_numpy(pickle.loads(data)) if code == CUSTOM_TYPE_PICKLE: return pickle.loads(data) + if code == CUSTOM_TYPE_CLOUDPICKLE: + return cloudpickle.loads(data) raise NotImplementedError(f"Extension type code {code} is not supported") From 6fa7cd3dbcf3e78e36431ca31abd973e5617dd27 Mon Sep 17 00:00:00 2001 From: shangmingc Date: Sat, 29 Mar 2025 19:01:46 +0800 Subject: [PATCH 103/593] [Feature][Disaggregated] Support XpYd disaggregated prefill with MooncakeStore (#12957) Signed-off-by: Shangming Cai --- .../disagg_examples/disagg_proxy_demo.py | 450 ++++++++++++++++++ .../kv_transfer/kv_connector/factory.py | 5 + .../kv_connector/mooncake_store_connector.py | 216 +++++++++ .../kv_transfer/kv_lookup_buffer/base.py | 87 +++- .../kv_lookup_buffer/mooncake_store.py | 160 +++++++ 5 files changed, 907 insertions(+), 11 deletions(-) create mode 100644 examples/online_serving/disagg_examples/disagg_proxy_demo.py create mode 100644 vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py create mode 100644 vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py diff --git a/examples/online_serving/disagg_examples/disagg_proxy_demo.py b/examples/online_serving/disagg_examples/disagg_proxy_demo.py new file mode 100644 index 0000000000000..a701636f357a8 --- /dev/null +++ b/examples/online_serving/disagg_examples/disagg_proxy_demo.py @@ -0,0 +1,450 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This file provides a disaggregated prefilling proxy demo to demonstrate an +example usage of XpYd disaggregated prefilling. 
+We can launch multiple vllm instances (2 for prefill and 2 for decode), and +launch this proxy demo through: + python3 examples/online_serving/disagg_examples/disagg_proxy_demo.py \ + --model $model_name \ + --prefill localhost:8100 localhost:8101 \ + --decode localhost:8200 localhost:8201 \ + --port 8000 + +Note: This demo will be removed once the PDController implemented in PR 15343 +(https://github.com/vllm-project/vllm/pull/15343) supports XpYd. +""" +import argparse +import ipaddress +import itertools +import json +import logging +import os +import sys +from abc import ABC, abstractmethod +from typing import Callable, Optional + +import aiohttp +import requests +import uvicorn +from fastapi import (APIRouter, Depends, FastAPI, Header, HTTPException, + Request, status) +from fastapi.responses import JSONResponse, StreamingResponse + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) +logger = logging.getLogger() +logging.basicConfig(level=logging.INFO) + + +class SchedulingPolicy(ABC): + + @abstractmethod + def schedule(self, cycler: itertools.cycle): + raise NotImplementedError("Scheduling Proxy is not set.") + + +class Proxy: + + def __init__( + self, + prefill_instances: list[str], + decode_instances: list[str], + model: str, + scheduling_policy: SchedulingPolicy, + custom_create_completion: Optional[Callable[[Request], + StreamingResponse]] = None, + custom_create_chat_completion: Optional[Callable[ + [Request], StreamingResponse]] = None, + ): + self.prefill_instances = prefill_instances + self.decode_instances = decode_instances + self.prefill_cycler = itertools.cycle(prefill_instances) + self.decode_cycler = itertools.cycle(decode_instances) + self.model = model + self.scheduling_policy = scheduling_policy + self.custom_create_completion = custom_create_completion + self.custom_create_chat_completion = custom_create_chat_completion + self.router = APIRouter() + self.setup_routes() + + def setup_routes(self): + self.router.post( + 
"/v1/completions", + dependencies=[ + Depends(self.validate_json_request) + ])(self.custom_create_completion if self. + custom_create_completion else self.create_completion) + self.router.post( + "/v1/chat/completions", + dependencies=[ + Depends(self.validate_json_request) + ])(self.custom_create_chat_completion if self. + custom_create_chat_completion else self.create_chat_completion) + self.router.get("/status", + response_class=JSONResponse)(self.get_status) + self.router.post("/instances/add", + dependencies=[Depends(self.api_key_authenticate) + ])(self.add_instance_endpoint) + + async def validate_json_request(self, raw_request: Request): + content_type = raw_request.headers.get("content-type", "").lower() + if content_type != "application/json": + raise HTTPException( + status_code=415, + detail= + "Unsupported Media Type: Only 'application/json' is allowed", + ) + + def api_key_authenticate(self, x_api_key: str = Header(...)): + expected_api_key = os.environ.get("ADMIN_API_KEY") + if not expected_api_key: + logger.error("ADMIN_API_KEY is not set in the environment.") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Server configuration error.", + ) + if x_api_key != expected_api_key: + logger.warning("Unauthorized access attempt with API Key: %s", + x_api_key) + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="Forbidden: Invalid API Key.", + ) + + async def validate_instance(self, instance: str) -> bool: + url = f"http://{instance}/v1/models" + try: + async with aiohttp.ClientSession( + timeout=AIOHTTP_TIMEOUT) as client: + logger.info("Verifying %s ...", instance) + async with client.get(url) as response: + if response.status == 200: + data = await response.json() + if "data" in data and len(data["data"]) > 0: + model_cur = data["data"][0].get("id", "") + if model_cur == self.model: + logger.info("Instance: %s could be added.", + instance) + return True + else: + logger.warning("Mismatch model %s 
: %s != %s", + instance, model_cur, self.model) + return False + else: + return False + else: + return False + except aiohttp.ClientError as e: + logger.error(str(e)) + return False + except Exception as e: + logger.error(str(e)) + return False + + async def add_instance_endpoint(self, request: Request): + try: + data = await request.json() + logger.warning(str(data)) + instance_type = data.get("type") + instance = data.get("instance") + if instance_type not in ["prefill", "decode"]: + raise HTTPException(status_code=400, + detail="Invalid instance type.") + if not instance or ":" not in instance: + raise HTTPException(status_code=400, + detail="Invalid instance format.") + host, port_str = instance.split(":") + try: + if host != "localhost": + ipaddress.ip_address(host) + port = int(port_str) + if not (0 < port < 65536): + raise HTTPException(status_code=400, + detail="Invalid port number.") + except Exception as e: + raise HTTPException(status_code=400, + detail="Invalid instance address.") from e + + is_valid = await self.validate_instance(instance) + if not is_valid: + raise HTTPException(status_code=400, + detail="Instance validation failed.") + + if instance_type == "prefill": + if instance not in self.prefill_instances: + self.prefill_instances.append(instance) + self.prefill_cycler = itertools.cycle( + self.prefill_instances) + else: + raise HTTPException(status_code=400, + detail="Instance already exists.") + else: + if instance not in self.decode_instances: + self.decode_instances.append(instance) + self.decode_cycler = itertools.cycle(self.decode_instances) + else: + raise HTTPException(status_code=400, + detail="Instance already exists.") + + return JSONResponse(content={ + "message": + f"Added {instance} to {instance_type}_instances." 
+ }) + except HTTPException as http_exc: + raise http_exc + except Exception as e: + logger.error("Error in add_instance_endpoint: %s", str(e)) + raise HTTPException(status_code=500, detail=str(e)) from e + + async def forward_request(self, url, data, use_chunked=True): + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } + try: + async with session.post(url=url, json=data, + headers=headers) as response: + if 200 <= response.status < 300 or 400 <= response.status < 500: # noqa: E501 + if use_chunked: + async for chunk_bytes in response.content.iter_chunked( # noqa: E501 + 1024): + yield chunk_bytes + else: + content = await response.read() + yield content + else: + error_content = await response.text() + try: + error_content = json.loads(error_content) + except json.JSONDecodeError: + error_content = error_content + logger.error("Request failed with status %s: %s", + response.status, error_content) + raise HTTPException( + status_code=response.status, + detail= + f"Request failed with status {response.status}: " + f"{error_content}", + ) + except aiohttp.ClientError as e: + logger.error("ClientError occurred: %s", str(e)) + raise HTTPException( + status_code=502, + detail= + "Bad Gateway: Error communicating with upstream server.", + ) from e + except Exception as e: + logger.error("Unexpected error: %s", str(e)) + raise HTTPException(status_code=500, detail=str(e)) from e + + def schedule(self, cycler: itertools.cycle) -> str: + return self.scheduling_policy.schedule(cycler) + + async def get_status(self): + status = { + "prefill_node_count": len(self.prefill_instances), + "decode_node_count": len(self.decode_instances), + "prefill_nodes": self.prefill_instances, + "decode_nodes": self.decode_instances, + } + return status + + async def create_completion(self, raw_request: Request): + try: + request = await raw_request.json() + + kv_prepare_request = 
request.copy() + kv_prepare_request["max_tokens"] = 1 + + prefill_instance = self.schedule(self.prefill_cycler) + try: + async for _ in self.forward_request( + f"http://{prefill_instance}/v1/completions", + kv_prepare_request): + continue + except HTTPException as http_exc: + self.remove_instance_endpoint("prefill", prefill_instance) + raise http_exc + + # Perform kv recv and decoding stage + decode_instance = self.schedule(self.decode_cycler) + + try: + generator = self.forward_request( + f"http://{decode_instance}/v1/completions", request) + except HTTPException as http_exc: + self.remove_instance_endpoint("decode", decode_instance) + raise http_exc + response = StreamingResponse(generator) + return response + except Exception: + import sys + + exc_info = sys.exc_info() + print("Error occurred in disagg proxy server") + print(exc_info) + + async def create_chat_completion(self, raw_request: Request): + try: + request = await raw_request.json() + + # add params to request + kv_prepare_request = request.copy() + kv_prepare_request["max_tokens"] = 1 + + # prefill stage + prefill_instance = self.schedule(self.prefill_cycler) + try: + async for _ in self.forward_request( + f"http://{prefill_instance}/v1/chat/completions", + kv_prepare_request): + continue + except HTTPException as http_exc: + self.remove_instance_endpoint("prefill", prefill_instance) + raise http_exc + # Perform kv recv and decoding stage + decode_instance = self.schedule(self.decode_cycler) + + try: + generator = self.forward_request( + "http://" + decode_instance + "/v1/chat/completions", + request) + except HTTPException as http_exc: + self.remove_instance_endpoint("decode", decode_instance) + raise http_exc + response = StreamingResponse(content=generator) + return response + except Exception: + exc_info = sys.exc_info() + error_messages = [str(e) for e in exc_info if e] + print("Error occurred in disagg proxy server") + print(error_messages) + return 
StreamingResponse(content=iter(error_messages), + media_type="text/event-stream") + + def remove_instance_endpoint(self, instance_type, instance): + if (instance_type == "decode" and instance in self.decode_instances): + self.decode_instances.remove(instance) + self.decode_cycler = itertools.cycle(self.decode_instances) + if (instance_type == "prefill" and instance in self.decode_instances): + self.prefill_instances.remove(instance) + self.prefill_cycler = itertools.cycle(self.decode_instances) + + +class RoundRobinSchedulingPolicy(SchedulingPolicy): + + def __init__(self): + super().__init__() + + def schedule(self, cycler: itertools.cycle) -> str: + return next(cycler) + + +class ProxyServer: + + def __init__( + self, + args: argparse.Namespace, + scheduling_policy: Optional[SchedulingPolicy] = None, + create_completion: Optional[Callable[[Request], + StreamingResponse]] = None, + create_chat_completion: Optional[Callable[[Request], + StreamingResponse]] = None, + ): + self.validate_parsed_serve_args(args) + self.port = args.port + self.proxy_instance = Proxy( + prefill_instances=[] if args.prefill is None else args.prefill, + decode_instances=[] if args.decode is None else args.decode, + model=args.model, + scheduling_policy=(scheduling_policy if scheduling_policy + is not None else RoundRobinSchedulingPolicy()), + custom_create_completion=create_completion, + custom_create_chat_completion=create_chat_completion, + ) + + def validate_parsed_serve_args(self, args: argparse.Namespace): + if not args.prefill: + raise ValueError("Please specify at least one prefill node.") + if not args.decode: + raise ValueError("Please specify at least one decode node.") + self.validate_instances(args.prefill) + self.validate_instances(args.decode) + self.verify_model_config(args.prefill, args.model) + self.verify_model_config(args.decode, args.model) + + def validate_instances(self, instances: list): + for instance in instances: + if len(instance.split(":")) != 2: + raise 
ValueError(f"Invalid instance format: {instance}") + host, port = instance.split(":") + try: + if host != "localhost": + ipaddress.ip_address(host) + port = int(port) + if not (0 < port < 65536): + raise ValueError( + f"Invalid port number in instance: {instance}") + except Exception as e: + raise ValueError( + f"Invalid instance {instance}: {str(e)}") from e + + def verify_model_config(self, instances: list, model: str) -> None: + model_suffix = model.split("/")[-1] + for instance in instances: + try: + response = requests.get(f"http://{instance}/v1/models") + if response.status_code == 200: + model_cur = response.json()["data"][0]["id"] + model_cur_suffix = model_cur.split("/")[-1] + if model_cur_suffix != model_suffix: + raise ValueError( + f"{instance} serves a different model: " + f"{model_cur} != {model}") + else: + raise ValueError(f"Cannot get model id from {instance}!") + except requests.RequestException as e: + raise ValueError( + f"Error communicating with {instance}: {str(e)}") from e + + def run_server(self): + app = FastAPI() + app.include_router(self.proxy_instance.router) + config = uvicorn.Config(app, port=self.port, loop="uvloop") + server = uvicorn.Server(config) + server.run() + + +if __name__ == "__main__": + # Todo: allow more config + parser = argparse.ArgumentParser("vLLM disaggregated proxy server.") + parser.add_argument("--model", + "-m", + type=str, + required=True, + help="Model name") + + parser.add_argument( + "--prefill", + "-p", + type=str, + nargs="+", + help="List of prefill node URLs (host:port)", + ) + + parser.add_argument( + "--decode", + "-d", + type=str, + nargs="+", + help="List of decode node URLs (host:port)", + ) + + parser.add_argument( + "--port", + type=int, + default=8000, + help="Server port number", + ) + args = parser.parse_args() + proxy_server = ProxyServer(args=args) + proxy_server.run_server() diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py 
b/vllm/distributed/kv_transfer/kv_connector/factory.py index 7336c54ec8a30..e37ce6dc75b03 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -53,3 +53,8 @@ KVConnectorFactory.register_connector( "LMCacheConnector", "vllm.distributed.kv_transfer.kv_connector.lmcache_connector", "LMCacheConnector") + +KVConnectorFactory.register_connector( + "MooncakeStoreConnector", + "vllm.distributed.kv_transfer.kv_connector.mooncake_store_connector", + "MooncakeStoreConnector") \ No newline at end of file diff --git a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py new file mode 100644 index 0000000000000..c5135dab23eba --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py @@ -0,0 +1,216 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +MooncakeStore Connector for Distributed Machine Learning Inference + +The MooncakeStoreConnector transfers KV caches between prefill vLLM workers +(KV cache producer) and decode vLLM workers (KV cache consumer) using a +database-style KVStore. 
+""" +import hashlib +from typing import TYPE_CHECKING, List, Tuple, Union + +import torch + +from vllm import _custom_ops as ops +from vllm.config import VllmConfig +from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase +from vllm.logger import init_logger +from vllm.sequence import IntermediateTensors + +if TYPE_CHECKING: + from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata + +logger = init_logger(__name__) + + +class MooncakeStoreConnector(KVConnectorBase): + + def __init__( + self, + rank: int, + local_rank: int, + config: VllmConfig, + ): + self.config = config.kv_transfer_config + self.tp_size = config.parallel_config.tensor_parallel_size + + self.local_tp_rank = local_rank + + # Init kv_store + if self.config.kv_connector == "MooncakeStoreConnector": + # Check if MOONCAKE_CONFIG_PATH is set + import os + use_mooncake_store = os.getenv('MOONCAKE_CONFIG_PATH') is not None + + if not use_mooncake_store: + raise ValueError( + "To use MooncakeStoreConnector, you need to pass the ENV: " + "'MOONCAKE_CONFIG_PATH=/path/to/mooncake_config.json'.") + else: + from vllm.distributed.kv_transfer.kv_lookup_buffer.mooncake_store import ( # noqa: E501 + MooncakeStore) + logger.info( + "Initializing KVStoreConnector under kv_transfer_config %s", + self.config) + self.kv_store = MooncakeStore(config) + else: + logger.error("Can not find %s", self.config.kv_connector) + + assert self.kv_store is not None + + def close(self) -> None: + """Close the buffer and release resources. + This method is responsible for cleaning up resources related to the + connector when it is no longer needed. + Raises: + NotImplementedError: This method must be implemented in subclasses. 
+ """ + self.kv_store.close() + + def send_kv_caches_and_hidden_states( + self, + model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor], + hidden_or_intermediate_states: Union[torch.Tensor, + IntermediateTensors], + ) -> None: + input_tokens_tensor = model_input.input_tokens + seq_lens = model_input.attn_metadata.seq_lens + slot_mapping_flat = model_input.attn_metadata.slot_mapping.flatten() + start_layer = model_executable.model.start_layer + end_layer = model_executable.model.end_layer + + model_config = model_executable.model.config + num_heads = int(model_config.num_key_value_heads / self.tp_size) + hidden_size = model_config.hidden_size + num_attention_heads = model_config.num_attention_heads + head_size = int(hidden_size / num_attention_heads) + + for idx, slen in enumerate(seq_lens): + start_pos = sum(seq_lens[:idx]) + end_pos = start_pos + slen + + current_tokens = input_tokens_tensor[start_pos:end_pos] + store_key_prefix = self.tensor_hash(current_tokens) + keys, values = [], [] + + for layer_id in range(start_layer, end_layer): + kv_cache = kv_caches[layer_id - start_layer] + + key_cache = kv_cache[0].reshape(-1, num_heads, head_size) + value_cache = kv_cache[1].reshape(-1, num_heads, head_size) + + current_slot_mapping = slot_mapping_flat[start_pos:end_pos] + + keys.append(key_cache[current_slot_mapping].unsqueeze(0)) + values.append(value_cache[current_slot_mapping].unsqueeze(0)) + + keys = torch.cat(keys, dim=0) + values = torch.cat(values, dim=0) + kvcache_to_sent = torch.stack((keys, values), dim=0) + store_kvcache_key = f"{store_key_prefix}_{self.local_tp_rank}" + self.kv_store.put(store_kvcache_key, kvcache_to_sent) + + hidden_key = f"{store_key_prefix}_hidden_{self.local_tp_rank}" + self.kv_store.put(hidden_key, + hidden_or_intermediate_states[start_pos:end_pos]) + + logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank()) + + def recv_kv_caches_and_hidden_states( + 
self, model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor] + ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, + "ModelInputForGPUWithSamplingMetadata"]: + bypass_model_exec = True + input_tokens_tensor = model_input.input_tokens + seq_lens = model_input.attn_metadata.seq_lens + num_prefill_tokens = model_input.attn_metadata.num_prefill_tokens + slot_mapping = model_input.attn_metadata.slot_mapping.flatten() + start_layer = model_executable.model.start_layer + end_layer = model_executable.model.end_layer + hidden_or_intermediate_states_for_one_req = [] + + for idx, slen in enumerate(seq_lens): + start_pos = sum(seq_lens[:idx]) + end_pos = start_pos + slen + + if start_pos >= num_prefill_tokens: + # This can happen during inflight batching. See: + # vllm/worker/model_runner.py::_prepare_model_input_tensors: + # - input_tokens[:num_prefill_tokens] contains prefill tokens. + # - input_tokens[num_prefill_tokens:] contains decode tokens. + logger.warning("You should set --enable_chunked_prefill=False " + "and --max_num_batched_tokens " + "should be equal to max_seq_len_to_capture") + bypass_model_exec = False + assert start_pos == num_prefill_tokens + break + + current_tokens = input_tokens_tensor[start_pos:end_pos] + + # get roi for current seq + load_key_prefix = self.tensor_hash(current_tokens) + load_kvcache_key = f"{load_key_prefix}_{self.local_tp_rank}" + remote_kv = self.kv_store.get(load_kvcache_key) + hidden_key = f"{load_key_prefix}_hidden_{self.local_tp_rank}" + hidden = self.kv_store.get(hidden_key) + + if remote_kv is None or hidden is None: + # didn't find any match. + bypass_model_exec = False + continue + + num_computed_tokens = current_tokens.shape[0] + + # update the end position based on how many tokens are cached. 
+ end_pos = start_pos + num_computed_tokens + + # call self.kv_store to get kv layer by layer + for layer_id in range(start_layer, end_layer): + layer = model_executable.model.layers[layer_id] + # get kvcache object + kv_cache = kv_caches[layer_id - start_layer] + key_cache, value_cache = kv_cache[0], kv_cache[1] + # get remote kvcache + + remote_k, remote_v = remote_kv[0][layer_id], remote_kv[1][ + layer_id] + # use ops.reshape_and_cache_flash to put kv into kvcache + ops.reshape_and_cache_flash( + remote_k.to(key_cache.device), + remote_v.to(value_cache.device), + key_cache, + value_cache, + slot_mapping[start_pos:end_pos], + layer.self_attn.attn.kv_cache_dtype, + layer.self_attn.attn._k_scale, + layer.self_attn.attn._v_scale, + ) + + hidden_or_intermediate_states_for_one_req.append(hidden) + + if not bypass_model_exec: + logger.warning( + "[rank%d]: Failed to receive all KVs and hidden " + "states, redo model forwarding.", torch.distributed.get_rank()) + hidden_or_intermediate_states = None + + else: + logger.debug( + "[rank%d]: Successfully received all KVs and hidden " + "states, skip model forwarding.", torch.distributed.get_rank()) + hidden_or_intermediate_states = torch.cat( + hidden_or_intermediate_states_for_one_req, dim=0) + + return hidden_or_intermediate_states, bypass_model_exec, model_input + + @staticmethod + def tensor_hash(tensor: torch.Tensor) -> int: + """Calculate the hash value of the tensor.""" + tensor_bytes = tensor.clone().detach().cpu().numpy().tobytes() + hash_object = hashlib.blake2b(tensor_bytes) + hash_hex = hash_object.hexdigest() + return int(hash_hex[:16], 16) diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py index 845da7c501e88..bea42846e9e41 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py @@ -1,11 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 """ -This file contains a new class 
`KVLookupBufferBase` that allows developers to -think of KV cache operations as inserting new KV cache entries (`insert`) -into the lookup buffer and querying existing KV caches (`drop_select`) +This file contains a new class `KVLookupBufferBase` that allows developers to +think of KV cache operations as inserting new KV cache entries (`insert`) +into the lookup buffer and querying existing KV caches (`drop_select`) from the lookup buffer. -All distributed communications are abstracted behind this class. +This file also contains a new class `KVStoreBufferBase` that allows developers +to manage the KVCache buffer as a simple key-value storage buffer with basic +put/get operations. + +These classes above are abstracted behind class `KVCacheBufferBase`. """ from abc import ABC, abstractmethod @@ -14,9 +18,27 @@ from typing import List, Optional import torch -class KVLookupBufferBase(ABC): +class KVCacheBufferBase(ABC): """ - Abstract base class for a lookup buffer. + Abstract base class for a KVCache buffer. + """ + + @abstractmethod + def close(self) -> None: + """Close the buffer and release resources. + + This method is responsible for cleaning up resources related to the + KVCache buffer when it is no longer needed. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError + + +class KVLookupBufferBase(KVCacheBufferBase): + """ + Abstract base class for a KVCache lookup buffer. This class provides an abstraction for a key-value (KV) cache lookup buffer. @@ -96,12 +118,55 @@ class KVLookupBufferBase(ABC): """ raise NotImplementedError - @abstractmethod - def close(self) -> None: - """Close the buffer and release resources. - This method is responsible for cleaning up resources related to the - lookup buffer when it is no longer needed. +class KVStoreBufferBase(KVCacheBufferBase): + """ + Abstract base class for a KVCache storage buffer with key-value semantics. 
+ This class provides a simple key-value storage buffer abstract with basic + put/get operations, which enables flexible KVCache transfer granular + control. + + The functionality is similar to a distributed key-value store, where: + - Key: A unique string identifier for the cached entry + - Value: + - Tensor to be stored and retrieved + - None (indicating deletion or empty value) + """ + + @abstractmethod + def put( + self, + key: str, + value: Optional[torch.Tensor], + ) -> None: + """Store a key-value pair in the buffer. + + Args: + key (str): Unique identifier for a tensor, this tensor could be the + key cache tensor, value cache tensor, or hidden state tensor + generated during model forwarding. + + value (Optional[torch.Tensor]): Tensor to be stored. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError + + @abstractmethod + def get( + self, + key: str, + ) -> Optional[torch.Tensor]: + """Retrieve a value from the buffer by key. + + Args: + key (str): Unique identifier for a tensor, this tensor could be the + key cache tensor, value cache tensor, or hidden state tensor + generated during model forwarding. + + Returns: + Optional[torch.Tensor]: Stored tensor if exists, None otherwise. Raises: NotImplementedError: This method must be implemented in subclasses. diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py new file mode 100644 index 0000000000000..7fd5967293f26 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py @@ -0,0 +1,160 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This file contains a new class `MooncakeStore` that allows developers to +think of KV cache transfer operations as putting new KV cache entries +into a remote KVStore-based lookup buffer and getting existing KV caches +from this remote lookup buffer. 
+""" +import json +import os +from dataclasses import dataclass +from typing import Optional + +import torch +from safetensors.torch import load as safetensors_load +from safetensors.torch import save as safetensors_save + +from vllm.config import VllmConfig +from vllm.distributed.kv_transfer.kv_lookup_buffer.base import ( + KVStoreBufferBase) +from vllm.logger import init_logger + +DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200 # 3.125 GiB +DEFAULT_LOCAL_BUFFER_SIZE = 1073741824 # 1.0 GiB + +logger = init_logger(__name__) + + +@dataclass +class MooncakeStoreConfig: + local_hostname: str + metadata_server: str + global_segment_size: int + local_buffer_size: int + protocol: str + device_name: str + master_server_address: str + + @staticmethod + def from_file(file_path: str) -> 'MooncakeStoreConfig': + """Load the config from a JSON file.""" + with open(file_path) as fin: + config = json.load(fin) + return MooncakeStoreConfig( + local_hostname=config.get("local_hostname"), + metadata_server=config.get("metadata_server"), + global_segment_size=config.get("global_segment_size", + DEFAULT_GLOBAL_SEGMENT_SIZE), + local_buffer_size=config.get("local_buffer_size", + DEFAULT_LOCAL_BUFFER_SIZE), + protocol=config.get("protocol", "tcp"), + device_name=config.get("device_name", ""), + master_server_address=config.get("master_server_address"), + ) + + @staticmethod + def load_from_env() -> 'MooncakeStoreConfig': + """Load config from a file specified in the environment variable.""" + config_file_path = os.getenv('MOONCAKE_CONFIG_PATH') + if config_file_path is None: + raise ValueError( + "The environment variable 'MOONCAKE_CONFIG_PATH' is not set.") + return MooncakeStoreConfig.from_file(config_file_path) + + +class MooncakeStore(KVStoreBufferBase): + + def __init__( + self, + config: VllmConfig, + ): + + try: + from mooncake_vllm_adaptor import MooncakeDistributedStore + except ImportError as e: + raise ImportError( + "Please install mooncake by following the instructions at " + 
"https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md " # noqa: E501 + "to run vLLM with MooncakeConnector.") from e + + try: + self.store = MooncakeDistributedStore() + self.config = MooncakeStoreConfig.load_from_env() + logger.info("Mooncake Configuration loaded successfully.") + + self.store.setup(self.config.local_hostname, + self.config.metadata_server, + self.config.global_segment_size, + self.config.local_buffer_size, + self.config.protocol, self.config.device_name, + self.config.master_server_address) + + except ValueError as e: + logger.error("Configuration loading failed: %s", e) + raise + except Exception as exc: + logger.error( + "An error occurred while loading the configuration: %s", exc) + raise + + def close(self): + # MooncakeDistributedStore will automatically call the destructor, so + # it is unnecessary to close it manually. + pass + + def put( + self, + key: str, + value: Optional[torch.Tensor], + ) -> None: + # A message queue needs to be introduced before making it asynchronous. + if value is not None: + self._put_impl(key, value) + + def get( + self, + key: str, + ) -> Optional[torch.Tensor]: + # A message queue needs to be introduced before making it asynchronous. 
+ value = self._get_impl(key) + return value + + def _put_impl( + self, + key: str, + value: torch.Tensor, + ) -> None: + """Put KVCache to Mooncake Store""" + device_id = value.device.index if value.device.type == 'cuda' else -1 + device_tensor = torch.tensor(device_id, dtype=torch.int32) + value_bytes = safetensors_save({ + "tensor": value, + "device_id": device_tensor + }) + try: + self.store.put(key, value_bytes) + except TypeError as err: + logger.error("Failed to put value into Mooncake Store: %s", err) + raise TypeError("Mooncake Store Put Type Error.") from err + + def _get_impl( + self, + key: str, + ) -> Optional[torch.Tensor]: + """Get KVCache from Mooncake Store""" + try: + data = self.store.get(key) + except TypeError as err: + logger.error("Failed to get value from Mooncake Store: %s", err) + raise TypeError("Mooncake Store Get Type Error.") from err + + if data: + loaded_tensors = safetensors_load(data) + tensor = loaded_tensors["tensor"] + device_id_tensor = loaded_tensors["device_id"] + device_id = int(device_id_tensor.item()) + device = torch.device( + 'cuda', device_id) if device_id >= 0 else torch.device('cpu') + return tensor.to(device) + + return None From c67abd614fe670b1cc771097658dd7efe4a33747 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sat, 29 Mar 2025 06:30:09 -0700 Subject: [PATCH 104/593] [V1] Support interleaved modality items (#15605) Signed-off-by: Roger Wang --- .buildkite/test-pipeline.yaml | 1 + tests/conftest.py | 39 +++++---- .../vision_language/test_interleaved.py | 77 ++++++++++++++++++ tests/multimodal/test_utils.py | 80 +++++++++++++++---- vllm/multimodal/utils.py | 72 ++++++----------- vllm/v1/engine/processor.py | 51 +++++------- 6 files changed, 205 insertions(+), 115 deletions(-) create mode 100644 tests/models/decoder_only/vision_language/test_interleaved.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 62872bf8e3e18..99358d5579919 
100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -431,6 +431,7 @@ steps: - pytest -v -s models/encoder_decoder/audio_language -m core_model - pytest -v -s models/encoder_decoder/language -m core_model - pytest -v -s models/encoder_decoder/vision_language -m core_model + - pytest -v -s models/decoder_only/vision_language/test_interleaved.py - label: Multi-Modal Models Test (Extended) 1 # 48m optional: true diff --git a/tests/conftest.py b/tests/conftest.py index cc48fceb8eff0..6627ab638bf55 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -747,30 +747,27 @@ class VllmRunner: videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, ) -> list[TextPrompt]: - if images is not None: - assert len(prompts) == len(images) - if videos is not None: - assert len(prompts) == len(videos) + if any(x is not None and len(x) != len(prompts) + for x in [images, videos, audios]): + raise ValueError( + "All non-None multimodal inputs must have the same length as " + "prompts") - if audios is not None: - assert len(prompts) == len(audios) + inputs = [] + for i, prompt in enumerate(prompts): + multi_modal_data = {} + if images is not None and (image := images[i]) is not None: + multi_modal_data["image"] = image + if videos is not None and (video := videos[i]) is not None: + multi_modal_data["video"] = video + if audios is not None and (audio := audios[i]) is not None: + multi_modal_data["audio"] = audio - inputs = [TextPrompt(prompt=prompt) for prompt in prompts] - if images is not None: - for i, image in enumerate(images): - if image is not None: - inputs[i]["multi_modal_data"] = {"image": image} - - if videos is not None: - for i, video in enumerate(videos): - if video is not None: - inputs[i]["multi_modal_data"] = {"video": video} - - if audios is not None: - for i, audio in enumerate(audios): - if audio is not None: - inputs[i]["multi_modal_data"] = {"audio": audio} + inputs.append( + TextPrompt(prompt=prompt, + 
multi_modal_data=multi_modal_data + if multi_modal_data else None)) return inputs diff --git a/tests/models/decoder_only/vision_language/test_interleaved.py b/tests/models/decoder_only/vision_language/test_interleaved.py new file mode 100644 index 0000000000000..8804497ae616f --- /dev/null +++ b/tests/models/decoder_only/vision_language/test_interleaved.py @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset + +models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"] + + +def base_prompt(modalities_str: str) -> str: + return f"<|im_start|>user {modalities_str}\nDescribe what you see from these items.<|im_end|><|im_start|>assistant\n" # noqa: E501 + + +INTERLEAVED_PROMPT = base_prompt("